Commit 824d1c4a authored by Bharath Ramsundar
Browse files

Some cleanup in preparation for PR.

parent f5169457
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -193,7 +193,10 @@ def featurize_input(args):
      args.smiles_endpoint, args.id_endpoint, out_y_pkl, out_sdf)
  generate_features(df, args.feature_endpoints, args.smiles_endpoint,
                    args.id_endpoint, out_x_pkl)
  generate_fingerprints(df, args.name, args.out, args.smiles_endpoint, args.id_endpoint)
  generate_vs_utils_features(df, args.name, args.out, args.smiles_endpoint,
      args.id_endpoint, "fingerprints")
  generate_vs_utils_features(df, args.name, args.out, args.smiles_endpoint,
      args.id_endpoint, "descriptors")
  generate_descriptors(df, args.name, args.out, args.smiles_endpoint, args.id_endpoint)

def train_model(args):
+3 −3
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@ def compute_model_performance(per_task_data, task_types, models, modeltype,
  return all_results, aucs, r2s, rms

def model_predictions(test_set, model, n_targets, task_types,
    modeltype="sklearn", datatype="vector"):
    modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns a list of per-task predictions.
@@ -98,7 +98,7 @@ def model_predictions(test_set, model, n_targets, task_types,
  return ypreds

  
def eval_model(test_set, model, task_types, modeltype="sklearn", datatype="vector"):
def eval_model(test_set, model, task_types, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.

  Returns a dict which maps target-names to pairs of np.ndarrays (ytrue,
@@ -122,7 +122,7 @@ def eval_model(test_set, model, task_types, modeltype="sklearn", datatype="vecto
  local_task_types = task_types.copy()
  endpoints = sorted_targets
  ypreds = model_predictions(test_set, model, len(sorted_targets),
      local_task_types, modeltype=modeltype, datatype=datatype)
      local_task_types, modeltype=modeltype)
  results = {}
  for target in endpoints:
    results[target] = ([], [], [])  # (smiles, ytrue, yscore)
+20 −32
Original line number Diff line number Diff line
@@ -59,47 +59,38 @@ def parse_float_input(val):
    if ">" in val or "<" in val or "-" in val:
      return np.nan

def generate_vs_util_features(df, name, out, smiles_endpoint, id_endpoint, featuretype):
  """Featurizes a dataset with a vs_utils featurizer and pickles the result.

  Builds a DataFrame with "smiles", "scaffolds", "mol_id" and "features"
  columns and writes it, gzip-compressed and pickled, to
  <out>/<name>/<featuretype>/<name>-<featuretype>.pkl.gz.

  Args:
    df: pandas DataFrame holding one compound per row.
    name: Dataset name; used for the dataset directory and the file name.
    out: Root output directory.
    smiles_endpoint: Name of the df column holding SMILES strings.
    id_endpoint: Name of the df column holding compound identifiers.
    featuretype: Either "fingerprints" (CircularFingerprint, size 1024) or
      "descriptors" (SimpleDescriptors).

  Raises:
    ValueError: If featuretype is not one of the supported values.
  """
  # Validate first so an unsupported request fails before any work is done.
  if featuretype == "fingerprints":
    featurizer = CircularFingerprint(size=1024)
  elif featuretype == "descriptors":
    featurizer = SimpleDescriptors()
  else:
    raise ValueError("Unsupported featuretype requested.")

  dataset_dir = os.path.join(out, name)
  # Each feature type gets its own directory so that descriptors are not
  # written into the fingerprints directory.
  # NOTE(review): assumes <dataset_dir>/<featuretype> already exists, as the
  # original code assumed for "fingerprints" -- confirm against the caller.
  feature_dir = os.path.join(dataset_dir, featuretype)
  features = os.path.join(feature_dir,
      "%s-%s.pkl.gz" % (name, featuretype))

  feature_df = pd.DataFrame([])
  feature_df["smiles"] = df[[smiles_endpoint]]
  feature_df["scaffolds"] = df[[smiles_endpoint]].apply(
    functools.partial(generate_scaffold, smiles_endpoint=smiles_endpoint),
    axis=1)
  feature_df["mol_id"] = df[[id_endpoint]]

  # iterrows() yields (row_label, row_data) tuples; only the data is needed.
  mols = [Chem.MolFromSmiles(row_data[smiles_endpoint])
          for _, row_data in df.iterrows()]
  feature_df["features"] = pd.DataFrame(
      [{"features": feature} for feature in featurizer.featurize(mols)])

  with gzip.open(features, "wb") as f:
    pickle.dump(feature_df, f, pickle.HIGHEST_PROTOCOL)


# featurize_input() calls this function under the name
# generate_vs_utils_features; keep both spellings importable.
generate_vs_utils_features = generate_vs_util_features

# TODO(rbharath): CODE SMELL! This and generate_fingerprints look almost identical. Factor out into a generate_standard_featurization function.
'''
def generate_descriptors(df, name, out, smiles_endpoint, id_endpoint):
  """Generates molecular descriptors for dataset."""
  dataset_dir = os.path.join(out, name)
@@ -124,10 +115,7 @@ def generate_descriptors(df, name, out, smiles_endpoint, id_endpoint):

  with gzip.open(descriptors, "wb") as f:
    pickle.dump(descriptors_df, f, pickle.HIGHEST_PROTOCOL)
  #subprocess.call(["python", "-m", "vs_utils.scripts.featurize",
  #                 "--scaffolds", "--smiles",
  #                 sdf, descriptors,
  #                 "descriptors"])
'''

def get_rows(input_file, input_type, delimiter):
  """Returns an iterator over all rows in input_file"""
+0 −10
Original line number Diff line number Diff line
@@ -162,9 +162,6 @@ def load_assays(paths, target_dir_name="targets"):
              # Set all targets to invalid for now.
              labels[id][name] = -1
          labels[id][target_name] = measurement 
  print "load_assays()"
  print "labels"
  print labels
  return labels, splits

def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints"]):
@@ -180,12 +177,7 @@ def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints
  """
  data = {}
  molecules = load_molecules(paths, feature_types)
  print "load_datasets()"
  print "len(molecules)"
  print len(molecules)
  labels, splits = load_assays(paths, target_dir_name)
  print "len(labels)"
  print len(labels)
  for ind, id in enumerate(molecules):
    if id not in labels:
      continue
@@ -194,8 +186,6 @@ def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints
                "scaffold": mol["scaffold"],
                "labels": labels[id],
                "split": splits[id]}
  print "len(data)"
  print len(data)
  return data

def ensure_balanced(y, W):