Commit 132dd2ca, authored by Aneesh
Browse files

Wrote a load-from-two-files function to handle the NCI dataset

parent 6c588b20
Loading
Loading
Loading
Loading

datasets/nci_1.csv.gz

0 → 100644
+51.6 MiB

File added.

No diff preview for this file type.

datasets/nci_2.csv.gz

0 → 100644
+51.6 MiB

File added.

No diff preview for this file type.

+7 −5
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ import os
import sys
import numpy as np
import shutil
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_twofiles_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
@@ -40,10 +40,12 @@ def load_nci(base_dir, reload=True):

  # Load nci dataset
  print("About to load NCI dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/nci.csv.gz")
  print(dataset_file)
  dataset = load_from_disk(dataset_file)
  dataset_file1_path = os.path.join(
      current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(
      current_dir, "../../datasets/nci_2.csv.gz")

  dataset = load_twofiles_from_disk(dataset_file1_path, dataset_file2_path)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

+24 −0
Original line number Diff line number Diff line
@@ -42,6 +42,30 @@ def load_from_disk(filename):
  else:
    raise ValueError("Unrecognized filetype for %s" % filename)

# Handles plain *.csv files and gzip-compressed *.csv.gz files only.
def load_twofiles_from_disk(filename1, filename2):
  """Load two CSV files and return their rows stacked in one DataFrame.

  Each file is read with its first line as the header, missing values
  (NaN) are replaced by empty strings, and the rows of the second file
  are placed after the rows of the first. The row index of each file is
  kept as read, so index labels may repeat in the result.

  Parameters
  ----------
  filename1: str
    Path to the first file; must end in ".csv" or ".csv.gz".
  filename2: str
    Path to the second file; must end in ".csv" or ".csv.gz".

  Returns
  -------
  pandas.DataFrame
    Rows of filename1 followed by rows of filename2.

  Raises
  ------
  ValueError
    If either path does not end in ".csv" or ".csv.gz".
  """
  frames = []
  for path in (filename1, filename2):
    # A trailing ".gz" is stripped only to inspect the inner extension;
    # the untouched path is handed to pandas, whose read_csv() method
    # handles gzipped csv files.
    stem, ext = os.path.splitext(path)
    inner_ext = os.path.splitext(stem)[1] if ext == ".gz" else ext
    if inner_ext != ".csv":
      # Report the offending path itself (the old message referenced an
      # undefined name and crashed with NameError instead).
      raise ValueError("Unrecognized filetype for %s" % path)
    # First line of user-specified CSV *must* be header.
    df = pd.read_csv(path, header=0)
    df = df.replace(np.nan, str(""), regex=True)
    frames.append(df)
  # pd.concat replaces DataFrame.append, which newer pandas no longer has.
  return pd.concat(frames)

def load_pickle_from_disk(filename):
  """Load dataset from pickle file."""
  if ".gz" in filename: