Commit b0b5c118 authored by Aneesh's avatar Aneesh
Browse files

fixing small issues

parent 92fcde86
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ import os
import sys
import numpy as np
import shutil
from deepchem.utils.save import load_multfiles_from_disk
from deepchem.utils.save import load_sharded_csv
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
@@ -45,7 +45,7 @@ def load_nci(base_dir, reload=True):
  dataset_file2_path = os.path.join(
      current_dir, "../../datasets/nci_2.csv.gz")
  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset = load_multfiles_from_disk(dataset_paths)
  dataset = load_sharded_csv(dataset_paths)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

+9 −33
Original line number Diff line number Diff line
@@ -42,49 +42,25 @@ def load_from_disk(filename):
  else:
    raise ValueError("Unrecognized filetype for %s" % filename)

# Only handles *.csv and *.csv.gz files
def load_twofiles_from_disk(filename1, filename2):
  """Load two CSV files and stack them into a single DataFrame.

  Each file must be a ``.csv`` or a gzip-compressed ``.csv.gz`` file whose
  first line is the header row. Missing values are replaced with empty
  strings.

  Parameters
  ----------
  filename1, filename2: str
    Paths of the files to read, in the order their rows should appear.

  Returns
  -------
  pandas.DataFrame
    Rows of ``filename1`` followed by rows of ``filename2``. Each frame keeps
    its own row labels, so index values may repeat.

  Raises
  ------
  ValueError
    If a path does not end in ``.csv`` or ``.csv.gz``.
  """
  frames = []
  for path in (filename1, filename2):
    stem, ext = os.path.splitext(path)
    if ext == ".gz":
      # read_csv handles gzipped CSVs itself; only the inner extension
      # matters for validation.
      ext = os.path.splitext(stem)[1]
    if ext != ".csv":
      raise ValueError("Unrecognized filetype for %s" % path)
    # First line of user-specified CSV *must* be header.
    df = pd.read_csv(path, header=0)
    frames.append(df.replace(np.nan, str(""), regex=True))
  # pd.concat replaces DataFrame.append, which newer pandas no longer has.
  return pd.concat(frames)
 
def load_sharded_csv(filenames):
  """Load a dataset split across several CSV files into one DataFrame.

  Each file MUST have the same column headers. Files may be plain ``.csv``
  or gzip-compressed ``.csv.gz``; the first line of each file is read as the
  header row. Missing values are replaced with empty strings.

  Parameters
  ----------
  filenames: iterable of str
    Paths of the shards, in the order their rows should appear.

  Returns
  -------
  pandas.DataFrame
    Rows of all shards stacked in order, with a fresh 0..n-1 index.

  Raises
  ------
  ValueError
    If a path does not end in ``.csv`` or ``.csv.gz``, or if ``filenames``
    is empty.
  """
  dataframes = []
  for name in filenames:
    stem, ext = os.path.splitext(name)
    if ext == ".gz":
      # read_csv handles gzipped CSVs itself; only the inner extension
      # matters for validation.
      ext = os.path.splitext(stem)[1]
    if ext != ".csv":
      raise ValueError("Unrecognized filetype for %s" % name)
    # First line of user-specified CSV *must* be header.
    df = pd.read_csv(name, header=0)
    dataframes.append(df.replace(np.nan, str(""), regex=True))
  if not dataframes:
    raise ValueError("No files given to load_sharded_csv")
  # A single concat avoids re-copying the accumulated frame for every shard;
  # ignore_index=True renumbers the rows like reset_index(drop=True).
  return pd.concat(dataframes, ignore_index=True)


# Previous name of load_sharded_csv, kept so existing callers keep working.
load_multfiles_from_disk = load_sharded_csv