Commit 69931760 authored by Bharath Ramsundar
Browse files

Some fixes to NCI datasets/loading

parent aa0403c9
Loading
Loading
Loading
Loading
−47 KiB (51.6 MiB)

File changed.

No diff preview for this file type.

−46.7 KiB (51.5 MiB)

File changed.

No diff preview for this file type.

+4 −4
Original line number Diff line number Diff line
@@ -20,7 +20,6 @@ from deepchem.transformers import BalancingTransformer
def load_nci(base_dir, reload=True):
  """Load NCI datasets. Does not do train/test split"""
  # Set some global variables up top
  #reload = True
  verbosity = "high"
  model = "logistic"
  regen = False
@@ -29,7 +28,7 @@ def load_nci(base_dir, reload=True):
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      print("deleting dir in datasets.py")
      print("Deleting dir in nci_datasets.py")
      print(base_dir)
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
@@ -63,14 +62,15 @@ def load_nci(base_dir, reload=True):
                    'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',
                    'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
                    'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
                    'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'MDA-N', 'BT-549'])
                    'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
                    'T-47D'])

  featurizer = DataFeaturizer(tasks=all_nci_tasks,
                              smiles_field="smiles",
                              featurizers=featurizers,
                              verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    dataset = featurizer.featurize(dataset_paths, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
+4 −0
Original line number Diff line number Diff line
@@ -80,6 +80,10 @@ class DataFeaturizer(object):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbosity)

    # Allow users to specify a single file for featurization
    if not isinstance(input_files, list):
      input_files = [input_files]

    if not os.path.exists(data_dir):
      os.makedirs(data_dir)

+1 −1
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ def load_sdf_files(input_files):
    dataframes.append(pd.concat([mol_df, raw_df], axis=1, join='inner'))
  return dataframes

def load_csv_file(filenames, shard_size=None):
def load_csv_files(filenames, shard_size=None):
  """Load data as pandas dataframe."""
  # First line of user-specified CSV *must* be header.
  for filename in filenames:
Loading