Commit c8518e6e authored by Bharath
Browse files

Potential speedups

parent 7bdc2a80
Loading
Loading
Loading
Loading
+52 −42
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import log
####################################################### DEBUG
import time
import sys
####################################################### DEBUG

@@ -73,14 +74,21 @@ class Dataset(object):
      if not len(df):
        return None
      ############################################################## DEBUG
      print("About to call convert_df_to_numpy")
      print("mol_id_field")
      print(mol_id_field)
      #print("About to call convert_df_to_numpy")
      #print("mol_id_field")
      #print(mol_id_field)
      ############################################################## DEBUG
      ############################################################## DEBUG
      time1 = time.time()
      ############################################################## DEBUG
      ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field)
      ############################################################## DEBUG
      print("convert_df_to_numpy returned successfully")
      sys.stdout.flush()
      time2 = time.time()
      print("CONVERT_DF_TO_NUMPY TOOK %0.3f s" % (time2-time1))
      ############################################################## DEBUG
      ############################################################## DEBUG
      #print("convert_df_to_numpy returned successfully")
      #sys.stdout.flush()
      ############################################################## DEBUG
    else:
      ids, X, y, w = raw_data
@@ -602,71 +610,73 @@ def compute_sums_and_nb_sample(tensor, W=None):
# make it easy to use multiprocessing.
def convert_df_to_numpy(df, feature_type, tasks, mol_id_field):
  """Transforms a featurized dataset df into standard set of numpy arrays"""
  ############################################################## DEBUG
  print("SLKFJD:LSKJF:SLFKJ:SLDFKJSD:LKFJDSLKFJSDKFJSLKFJS:LFJSDLKJ")
  ############################################################## DEBUG
  if feature_type not in df.keys():
    ############################################################## DEBUG
    print("SLKFJD:LSKJF:SLFKJ:SLDFKJSD:LKFJDSLKFJSDKFJSLKFJS:LFJSDLKJ")
    print("feature_type")
    print(feature_type)
    sys.stdout.flush()
    ############################################################## DEBUG

    raise ValueError(
        "Featurized data does not support requested feature_type %s." % feature_type)
  # perform common train/test split across all tasks
  n_samples = df.shape[0]
  n_tasks = len(tasks)
  ############################################################## DEBUG
  time1 = time.time()
  ############################################################## DEBUG
  y = np.hstack([
      np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
  ############################################################## DEBUG
  time2 = time.time()
  print("CONVERT_DF_TO_NUMPY Y COMP TOOK %0.3f s" % (time2-time1))
  ############################################################## DEBUG
  w = np.ones((n_samples, n_tasks))
  missing = np.zeros_like(y).astype(int)
  all_features = []
  #all_features = []
  feature_shape = None
  ############################################################## DEBUG
  print("convert_df_to_numpy --- about to loop through data.")
  time1 = time.time()
  ############################################################## DEBUG
  for ind in range(n_samples):
    ############################################################### DEBUG
    #print("sample %d" % ind)
    ############################################################### DEBUG
    datapoint = df.iloc[ind]
    features = np.squeeze(datapoint[feature_type])
    ############################################################### DEBUG
    #print("features.size")
    ############################################################### DEBUG
    if features.size == 0:
      features = np.zeros(feature_shape)
      all_features.append(features)
      missing[ind, :] = 1
      continue
    if feature_shape is None:
      feature_shape = features.shape
    #datapoint = df.iloc[ind]
    #features = np.squeeze(datapoint[feature_type])
    #if features.size == 0:
    #  features = np.zeros(feature_shape)
    #  all_features.append(features)
    #  missing[ind, :] = 1
    #  continue
    #if feature_shape is None:
    #  feature_shape = features.shape
    for task in range(n_tasks):
      if y[ind, task] == "":
        missing[ind, task] = 1
    if features.shape != feature_shape:
      missing[ind, :] = 1
      continue
    ############################################################### DEBUG
    #print("Done processing sample")
    ############################################################### DEBUG
    all_features.append(features)
  x = np.stack(all_features)
    #if features.shape != feature_shape:
    #  missing[ind, :] = 1
    #  continue
    #all_features.append(features)
  #x_orig = np.stack(all_features)
  #x = df.as_matrix(columns=[feature_type])
  x = np.array(list(df[feature_type].values))
  #print("x.shape")
  #print(x.shape)
  #print(x.shape, x_orig.shape)
  #print("type(x)")
  #print(type(x))
  ############################################################## DEBUG
  print("mol_id_field")
  print(mol_id_field)
  time2 = time.time()
  print("CONVERT_DF_TO_NUMPY X COMP TOOK %0.3f s" % (time2-time1))
  ############################################################## DEBUG
  sorted_ids = df[mol_id_field]

  # Set missing data to have weight zero
  # TODO(rbharath): There's a better way to do this with numpy indexing
  ############################################################## DEBUG
  time1 = time.time()
  ############################################################## DEBUG
  for ind in range(n_samples):
    for task in range(n_tasks):
      if missing[ind, task]:
        y[ind, task] = 0.
        w[ind, task] = 0.
  ############################################################## DEBUG
  time2 = time.time()
  print("CONVERT_DF_TO_NUMPY MISSING COMP TOOK %0.3f s" % (time2-time1))
  ############################################################## DEBUG

  # Adding this assertion in to avoid ill-formed outputs.
  assert len(sorted_ids) == len(x) == len(y) == len(w)
+5 −5
Original line number Diff line number Diff line
@@ -68,18 +68,18 @@ def load_bace(mode="regression", transform=True, split="20-80"):
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)
  featurizers = [UserDefinedFeaturizer(user_specified_features)]
  featurizer = DataFeaturizer(tasks=bace_tasks,
  featurizer = UserDefinedFeaturizer(user_specified_features)
  loader = DataFeaturizer(tasks=bace_tasks,
                              smiles_field="mol",
                              id_field="CID",
                              featurizers=featurizers)
                              featurizer=featurizer)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = featurizer.featurize(crystal_dataset_file, crystal_dir)
    crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)

+6 −6
Original line number Diff line number Diff line
@@ -43,18 +43,18 @@ def load_muv(base_dir, reload=True):

  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizers = [CircularFingerprint(size=1024)]
  featurizer = CircularFingerprint(size=1024)
  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                          'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                          'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                          'MUV-466', 'MUV-832'])

  featurizer = DataFeaturizer(tasks=all_MUV_tasks,
  loader = DataFeaturizer(tasks=all_MUV_tasks,
                          smiles_field="smiles",
                              featurizers=featurizers,
                          featurizer=featurizer,
                          verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
+6 −6
Original line number Diff line number Diff line
@@ -50,7 +50,7 @@ def load_nci(base_dir, reload=True, force_transform=False):

  # Featurize nci dataset
  print("About to featurize nci dataset.")
  featurizers = [CircularFingerprint(size=1024)]
  featurizer = CircularFingerprint(size=1024)
  # This was originally a sorted list in muv_datasets.py, but the CSV is already ordered, so the sort was removed.
  all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                    'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
@@ -65,12 +65,12 @@ def load_nci(base_dir, reload=True, force_transform=False):
                    'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
                    'T-47D'])

  featurizer = DataFeaturizer(tasks=all_nci_tasks,
  loader = DataFeaturizer(tasks=all_nci_tasks,
                          smiles_field="smiles",
                              featurizers=featurizers,
                          featurizer=featurizer,
                          verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_paths, data_dir)
    dataset = loader.featurize(dataset_paths, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
+6 −6
Original line number Diff line number Diff line
@@ -42,7 +42,7 @@ def load_pcba(base_dir, reload=True):

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizers = [CircularFingerprint(size=1024)]
  featurizer = CircularFingerprint(size=1024)
  all_PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
@@ -70,12 +70,12 @@ def load_pcba(base_dir, reload=True):
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  featurizer = DataFeaturizer(tasks=all_PCBA_tasks,
  loader = DataFeaturizer(tasks=all_PCBA_tasks,
                          smiles_field="smiles",
                              featurizers=featurizers,
                          featurizer=featurizer,
                          verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
Loading