Commit e7db067b authored by Bharath Ramsundar
Browse files

Fixes

parent 2246d0a7
Loading
Loading
Loading
Loading
+7 −4
Original line number Diff line number Diff line
@@ -55,7 +55,7 @@ def load_data(input_file, shard_size=None):
def _load_sdf_file(input_file):
  """Load SDF file into dataframe."""
  # Tasks are stored in .sdf.csv file
  raw_df = _load_csv_file(input_file+".csv")
  raw_df = _load_csv_file(input_file+".csv", shard_size=None).next()
  # Structures are stored in .sdf file
  print("Reading structures from %s." % input_file)
  suppl = Chem.SDMolSupplier(str(input_file), removeHs=False)
@@ -71,7 +71,10 @@ def _load_sdf_file(input_file):
def _load_csv_file(filename, shard_size=None):
  """Load data as pandas dataframe."""
  # First line of user-specified CSV *must* be header.
  for df in pd.read_csv(filename, header=0, chunksize=shard_size):
  if shard_size is None:
    yield pd.read_csv(filename)
  else:
    for df in pd.read_csv(filename, chunksize=shard_size):
      df = df.replace(np.nan, str(""), regex=True)
      yield df

+0 −2
Original line number Diff line number Diff line
"""
Manipulate CSV data files.
"""

from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
@@ -10,7 +9,6 @@ import pandas as pd
import csv
import argparse


def parse_args(input_args=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
+56 −0
Original line number Diff line number Diff line
"""
Splits large CSVs into multiple shards.
"""
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import argparse
import gzip
import pandas as pd

def parse_args(input_args=None):
  """Parse command-line arguments for the CSV sharding script.

  Args:
    input_args: Optional list of argument strings. When None, argparse
      falls back to reading the arguments from sys.argv.

  Returns:
    argparse.Namespace with attributes csv_file, shard_size (int), out,
    and gzip_output (bool).

  Raises:
    SystemExit: Raised by argparse when a required flag is missing or a
      value cannot be parsed.
  """
  parser = argparse.ArgumentParser(
      description="Splits large CSVs into multiple shards.")
  parser.add_argument(
      "--csv-file", required=True,
      help="Name of input CSV file.")
  # The value is used as the chunk size when reading the CSV, so it is the
  # number of rows per shard, not the number of shards.
  parser.add_argument(
      "--shard-size", required=True, type=int,
      help="Number of rows in each output shard.")
  parser.add_argument(
      "--out", required=True,
      help="Root name of output CSV shards.")
  parser.add_argument(
      "--gzip-output", action="store_true",
      help="Gzip the output.")
  return parser.parse_args(input_args)

def shard_csv(input_file, shard_size, out_name, gzip_output):
  """Shard the csv file into multiple shards.

  Reads input_file in chunks of shard_size rows and writes each chunk to
  its own file named "<out_name>_<shard_num>.csv" (or ".csv.gz" when
  gzip_output is set), with shard numbers starting at 0.

  Args:
    input_file: Path of the CSV file to split. The path is handed to
      pandas directly, which infers compression from the file extension.
    shard_size: Number of rows to place in each shard.
    out_name: Root name (path prefix) of the output shard files.
    gzip_output: If true, gzip-compress every shard that is written.
  """
  if gzip_output:
    compression, suffix = "gzip", "csv.gz"
  else:
    compression, suffix = None, "csv"
  # The first column is read as the index and written back out as the first
  # column of every shard by to_csv.
  shards = pd.read_csv(input_file, index_col=0, chunksize=shard_size)
  for shard_num, df_shard in enumerate(shards):
    output_name = "%s_%d.%s" % (out_name, shard_num, suffix)
    print("Writing output to %s" % output_name)
    df_shard.to_csv(output_name, compression=compression)

def main():
  """Script entry point: parse the command-line flags and shard the CSV."""
  parsed = parse_args()
  shard_csv(
      parsed.csv_file,
      parsed.shard_size,
      parsed.out,
      parsed.gzip_output)

# Run the sharding only when executed as a script, not when imported.
if __name__ == '__main__':
  main()
+1 −1
Original line number Diff line number Diff line
@@ -168,7 +168,7 @@ class SpecifiedSplitter(Splitter):

  def __init__(self, input_file, split_field, verbosity=None):
    """Provide input information for splits."""
    raw_df = load_data(input_file)
    raw_df = load_data(input_file, shard_size=None).next()
    self.splits = raw_df[split_field].values
    self.verbosity = verbosity