Commit 92fcde86 authored by Aneesh's avatar Aneesh
Browse files

Changed the load-from-disk method to accept a list of multiple CSV files, and fixed other small issues

parent 132dd2ca
Loading
Loading
Loading
Loading
+13 −13
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ import os
import sys
import numpy as np
import shutil
from deepchem.utils.save import load_twofiles_from_disk
from deepchem.utils.save import load_multfiles_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
@@ -44,8 +44,8 @@ def load_nci(base_dir, reload=True):
      current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(
      current_dir, "../../datasets/nci_2.csv.gz")

  dataset = load_twofiles_from_disk(dataset_file1_path, dataset_file2_path)
  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset = load_multfiles_from_disk(dataset_paths)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

+23 −0
Original line number Diff line number Diff line
@@ -66,6 +66,28 @@ def load_twofiles_from_disk(filename1, filename2):
  combined_df = dataframeList[0].append(dataframeList[1])
  return combined_df
 
def load_multfiles_from_disk(filenameList):
  """Load a dataset from multiple CSV files and stack them into one dataframe.

  Each file MUST have the same column headers, and the first line of every
  file must be that header. Files may be plain ``.csv`` or gzip-compressed
  ``.csv.gz``.

  Parameters
  ----------
  filenameList: iterable of str
    Paths of the files to load, in the order their rows should appear.

  Returns
  -------
  pandas.DataFrame
    Rows of all files concatenated in input order, with NaN entries replaced
    by empty strings and the index renumbered from 0.

  Raises
  ------
  ValueError
    If no filenames are given, or if a file is neither ``.csv`` nor
    ``.csv.gz``.
  """
  dataframeList = []
  for name in filenameList:
    # Look through an optional trailing ".gz" to find the real file type, but
    # keep the original path for reading.
    root, ext = os.path.splitext(name)
    if ext == ".gz":
      ext = os.path.splitext(root)[1]
    if ext != ".csv":
      raise ValueError("Unrecognized filetype for %s" % name)
    # First line of user-specified CSV *must* be header.
    df = pd.read_csv(name, header=0)
    df = df.replace(np.nan, str(""), regex=True)
    dataframeList.append(df)

  if not dataframeList:
    raise ValueError("No files were provided to load_multfiles_from_disk")

  # Combine all dataframes in a single pass; ignore_index renumbers the rows
  # 0..n-1, matching a reset_index(drop=True) on the stacked result.
  return pd.concat(dataframeList, ignore_index=True)

def load_pickle_from_disk(filename):
  """Load dataset from pickle file."""
  if ".gz" in filename:
@@ -75,3 +97,4 @@ def load_pickle_from_disk(filename):
    with open(filename, "rb") as f:
      df = pickle.load(f)
  return df
+0 −2
Original line number Diff line number Diff line
@@ -25,7 +25,6 @@ np.random.seed(123)
reload = False
verbosity = "high"

#base_data_dir = "/scratch/users/rbharath/muv"
base_data_dir = "/home/apappu/deepchem/examples/nci/dataset"

nci_tasks, dataset, transformers = load_nci(
@@ -83,7 +82,6 @@ if os.path.exists(model_dir):
os.makedirs(model_dir)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
  return SklearnModel(tasks, task_types, model_params, model_dir,
                      #model_instance=LogisticRegression(class_weight="balanced"),
                      model_instance=RandomForestClassifier(
                          class_weight="balanced",
                          n_estimators=500),