Commit ee5cccc0 authored by Bharath Ramsundar
Browse files

Cleanup.

parent a247f348
Loading
Loading
Loading
Loading
+0 −5
Original line number Diff line number Diff line
@@ -28,13 +28,8 @@ def fit_singletask_models(train_data, modeltype):
    None or "log". Only for regression outputs.
  """
  models = {}
  print "fit_singletask_models()"
  print "train_data.keys()"
  print train_data.keys()
  import numpy as np
  X_train = train_data["features"]
  print "np.shape(X_train)"
  print np.shape(X_train)
  sorted_tasks = train_data["sorted_tasks"]
  for task in sorted_tasks:
    print "Building model for task %s" % task
+1 −4
Original line number Diff line number Diff line
@@ -63,7 +63,6 @@ def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field,
  feature_dir = os.path.join(dataset_dir, featuretype)
  features_file = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))

  print("About to instantiate featurizer.")
  if featuretype == "fingerprints":
    featurizer = CircularFingerprint(size=1024)
  elif featuretype == "descriptors":
@@ -78,18 +77,16 @@ def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field,
      print("Featurizing molecule %d" % row_ind)
    mol = Chem.MolFromSmiles(row_data)
    features.append(featurizer.featurize([mol]))
  print("Done generating features. About to transfer them to dataframe.")

  feature_df = pd.DataFrame([])
  feature_df["features"] = pd.DataFrame(
      [{"features": feature} for feature in features])

  print("Done transfering to dataframe. About to populate remaining df fields.")
  feature_df["smiles"] = dataframe[[smiles_field]]
  feature_df["scaffolds"] = dataframe[[smiles_field]].apply(
      functools.partial(generate_scaffold, smiles_field=smiles_field),
      axis=1)
  feature_df["mol_id"] = dataframe[[id_field]]
  print("Populated 'smiles', 'scaffolds', 'mol_id' fields")

  print("About to write pkl.gz file")
  with gzip.open(features_file, "wb") as gzip_file:
+4 −4
Original line number Diff line number Diff line
@@ -22,12 +22,12 @@ def process_datasets(paths, feature_types=None, mode="multitask",
                     splittype="random", target_names=None):
  """Extracts datasets and split into train/test.

  Returns a dict with the following keys
  Returns a dict with the following key/value pairs

  "features" -> X
  "mol_ids"  -> ids
  features -> X
  mol_ids  -> ids
  target -> (y, W)
  "sorted_targets" -> sorted_targets
  sorted_targets -> sorted_targets

  Parameters
  ----------
+0 −26
Original line number Diff line number Diff line
@@ -345,29 +345,3 @@ def scaffold_separate(dataset):
      scaffolds[scaffold].append(mol_id)
  # Sort from largest to smallest scaffold sets
  return [elt for (scaffold, elt) in sorted(scaffolds.items(), key=lambda x: -len(x[1]))]

#def labels_to_weights(ytrue):
#  """Uses the true labels to compute and output sample weights.
#
#  Parameters
#  ----------
#  ytrue: list or np.ndarray
#    True labels.
#  """
#  n_total = np.shape(ytrue)[0]
#  n_positives = np.sum(ytrue)
#  n_negatives = n_total - n_positives
#  pos_weight = np.floor(n_negatives/n_positives)
#
#  sample_weights = np.zeros(np.shape(ytrue)[0])
#  for ind, entry in enumerate(ytrue):
#    if entry == 0:  # negative
#      sample_weights[ind] = 1
#    elif entry == 1:  # positive
#      sample_weights[ind] = pos_weight
#    else:
#      print("labels_to_weights()")
#      print("ytrue")
#      print(ytrue)
#      raise ValueError("ytrue can only contain 0s or 1s.")
#  return sample_weights