Commit 7bf3ea43 authored by Bharath Ramsundar

Changes required to run random forests (RFs) on Globavir data.

parent 6eeb68a0
+1 −1
@@ -28,7 +28,7 @@ def parse_args(input_args=None):
   """Parse command-line arguments."""
   parser = argparse.ArgumentParser()
   parser.add_argument('--datasets', required=1, nargs="+",
-                      choices=['muv', 'pcba', 'dude', 'pfizer'],
+                      choices=['muv', 'pcba', 'dude', 'pfizer', 'globavir'],
                       help='Name of dataset to process.')
   parser.add_argument("--paths", required=1, nargs="+",
                       help = "Paths to input datasets.")
+24 −4
"""
Train a variety of machine learning models on biochem datasets.
Process an input dataset into a format suitable for machine learning.
"""
import os
import cPickle as pickle
@@ -9,6 +9,8 @@ import openpyxl as px
import numpy as np
import argparse
from rdkit import Chem
import subprocess
from vs_utils.utils import SmilesGenerator

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -54,16 +56,31 @@ def parse_float_input(val):
     if ">" in val or "<" in val or "-" in val:
       return np.nan
 
-def process_globavir(xlsx_file, out_pkl, out_sdf):
+def generate_fingerprints(name, out):
+  dataset_dir = os.path.join(out, name)
+  fingerprint_dir = os.path.join(dataset_dir, "circular-scaffold-smiles")
+  shards_dir = os.path.join(dataset_dir, "shards")
+  sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
+  fingerprints = os.path.join(fingerprint_dir,
+      "%s-circular-scaffolds-smiles.pkl.gz" % name)
+  subprocess.call(["python", "-m", "vs_utils.scripts.featurize",
+                   "--scaffolds", "--smiles",
+                   sdf, fingerprints,
+                   "circular", "--size", "1024"])
+
+
+def generate_targets(xlsx_file, out_pkl, out_sdf):
   """Process Globavir xlsx file."""
   rows, mols = [], []
   W = px.load_workbook(xlsx_file, use_iterators=True)
   p = W.get_sheet_by_name(name="Sheet1")
+  smiles = SmilesGenerator()
   for row_index, row in enumerate(p.iter_rows()):
     # Skip row labels.
     if row_index == 0:
       continue
     row_data = [cell.internal_value for cell in row]
+    # TODO(rbharath): Generalize this code to work for non-Globavir data.
     row = {
       "compound_name": row_data[0],
       "isomeric_smiles": row_data[1],
@@ -76,7 +93,9 @@ def generate_targets(xlsx_file, out_pkl, out_sdf):
       "ido_percent_activity_10_um": parse_float_input(row_data[11]),
       "ido_percent_activity_1_um": parse_float_input(row_data[12])
     }
-    mols.append(Chem.MolFromSmiles(row["isomeric_smiles"]))
+    mol = Chem.MolFromSmiles(row["isomeric_smiles"])
+    row["smiles"] = smiles.get_smiles(mol)
+    mols.append(mol)
     rows.append(row)
   df = pd.DataFrame(rows)
   # Write pkl.gz file
@@ -92,7 +111,8 @@ def main():
 def main():
   args = parse_args()
   out_pkl, out_sdf = generate_directories(args.name, args.out)
-  process_globavir(args.xlsx, out_pkl, out_sdf)
+  generate_targets(args.xlsx, out_pkl, out_sdf)
+  generate_fingerprints(args.name, args.out)
 
 
 if __name__ == "__main__":
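With this change the script both writes targets (generate_targets) and shells out to vs_utils.scripts.featurize for circular fingerprints (generate_fingerprints). A hedged sketch of inspecting the targets output, assuming the pkl.gz holds a gzipped pickled DataFrame and using an illustrative path (Python 2, matching the cPickle import above):

  import gzip
  import cPickle as pickle

  # Illustrative path; generate_directories builds the real <out>/<name>/ layout.
  with gzip.open("out/globavir/globavir.pkl.gz", "rb") as f:
    df = pickle.load(f)
  # Each row holds the raw isomeric SMILES plus the canonical "smiles"
  # key added via SmilesGenerator in this commit.
  print df[["compound_name", "smiles", "tdo_percent_activity_10_um"]].head()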
+20 −11
@@ -12,7 +12,7 @@ from sklearn.metrics import mean_squared_error
 from sklearn.metrics import roc_auc_score
 from sklearn.metrics import r2_score
 
-def model_predictions(test_set, model, n_targets, n_descriptors=0,
+def model_predictions(test_set, model, n_targets, task_types, n_descriptors=0,
     add_descriptors=False, modeltype="sklearn"):
   """Obtains predictions of provided model on test_set.
 
@@ -29,6 +29,9 @@ def model_predictions(test_set, model, n_targets, n_descriptors=0,
     A trained scikit-learn or keras model.
   n_targets: int
     Number of output targets
+  task_types: dict
+    dict mapping target names to output type. Each output type must be either
+    "classification" or "regression".
   n_descriptors: int
     Number of output descriptors
   modeltype: string
@@ -42,15 +45,20 @@ def model_predictions(test_set, model, n_targets, n_descriptors=0,
     n_outputs = n_targets + n_descriptors
   else:
     n_outputs = n_targets
-  if modeltype == "sklearn":
-    ypreds = model.predict_proba(X)
-  elif modeltype == "keras":
-    ypreds = model.predict(X)
-  elif modeltype == "keras_multitask":
+  if modeltype == "keras_multitask":
     predictions = model.predict({"input": X})
     ypreds = []
     for index in range(n_outputs):
       ypreds.append(predictions["task%d" % index])
+  elif modeltype == "sklearn":
+    # Must be single-task (breaking multitask RFs here)
+    task_type = task_types.itervalues().next()
+    if task_type == "classification":
+      ypreds = model.predict_proba(X)
+    elif task_type == "regression":
+      ypreds = model.predict(X)
+  elif modeltype == "keras":
+    ypreds = model.predict(X)
   else:
     raise ValueError("Improper modeltype.")
   # Handle the edge case for singletask.
@@ -97,19 +105,20 @@ def eval_model(test_set, model, task_types, desc_transforms={}, modeltype="sklearn"
     local_task_types = task_types.copy()
     endpoints = sorted_targets
   ypreds = model_predictions(test_set, model, len(sorted_targets),
-      n_descriptors=len(desc_transforms), modeltype=modeltype,
-      add_descriptors=add_descriptors)
+      local_task_types, n_descriptors=len(desc_transforms),
+      modeltype=modeltype, add_descriptors=add_descriptors)
   results = {}
   for target in endpoints:
     results[target] = ([], [])  # (ytrue, yscore)
   # Iterate through test set data points.
-  sorted_smiles = sorted(test_set.keys())
-  for index, smiles in enumerate(sorted_smiles):
+  for index, smiles in enumerate(sorted(test_set.keys())):
     datapoint = test_set[smiles]
     labels = datapoint["labels"]
     for t_ind, target in enumerate(endpoints):
-      if target in sorted_targets and labels[target] == -1:
+      task_type = local_task_types[target]
+      if (task_type == "classification"
+          and target in sorted_targets
+          and labels[target] == -1):
         continue
       else:
         ytrue, yscore = results[target]
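The rewritten sklearn branch must dispatch on task type because classifiers and regressors expose different prediction APIs: RandomForestClassifier offers predict_proba, while RandomForestRegressor only offers predict. Note also the single-task assumption: task_types.itervalues().next() grabs an arbitrary task's type, which is only safe when all tasks share one type, as the inline comment concedes. A standalone sketch of the API difference (synthetic data, not from the commit):

  import numpy as np
  from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

  X = np.random.rand(20, 5)
  clf = RandomForestClassifier(n_estimators=10).fit(X, np.random.randint(0, 2, 20))
  reg = RandomForestRegressor(n_estimators=10).fit(X, np.random.rand(20))

  print clf.predict_proba(X).shape  # (20, 2): per-class probabilities
  print reg.predict(X).shape        # (20,): raw regression outputs
  # reg.predict_proba(X) raises AttributeError, hence the task_type
  # check in the sklearn branch above.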
+18 −9
@@ -30,6 +30,10 @@ def get_default_task_types_and_transforms(dataset_specs):
       for target in targets:
         task_types[target] = "regression"
         task_transforms[target] = ["log", "normalize"]
+    elif name == "globavir":
+      for target in targets:
+        task_types[target] = "regression"
+        task_transforms[target] = ["normalize"]
     elif name == "pdbbind":
       raise ValueError("pdbbind not yet supported!")
   return task_types, task_transforms
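Globavir targets default to regression with a bare normalize transform, unlike the log-plus-normalize applied to the potency-style datasets above. For two illustrative target names (echoing the columns generate_targets parses), the returned maps would look roughly like:

  targets = ["tdo_percent_activity_10_um", "ido_percent_activity_10_um"]
  task_types = dict((t, "regression") for t in targets)
  task_transforms = dict((t, ["normalize"]) for t in targets)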
@@ -77,7 +81,10 @@ def load_molecules(paths, dir_name="circular-scaffold-smiles"):
   molecules = {}
   for dataset_path in paths:
     pickle_dir = os.path.join(dataset_path, dir_name)
-    for pickle_file in os.listdir(pickle_dir):
+    pickle_files = os.listdir(pickle_dir)
+    if len(pickle_files) == 0:
+      raise ValueError("No Pickle Files found to load molecules")
+    for pickle_file in pickle_files:
       with gzip.open(os.path.join(pickle_dir, pickle_file), "rb") as f:
         contents = pickle.load(f)
         smiles, fingerprints, scaffolds, mol_ids = (
@@ -134,25 +141,28 @@ def load_assays(paths, target_dir_name="targets"):
           items = zip(contents["smiles"], contents["potency"])
         elif "targets" in contents:
           items = zip(contents["smiles"], contents["targets"])
+        # TODO(rbharath): Remove this horrible special purpose code.
+        elif "tdo_percent_activity_10_um" in contents:
+          items = zip(contents["smiles"], contents["tdo_percent_activity_10_um"])
         else:
-          raise ValueError("Must contain either potency or targets field.")
-        for mol, potency in items:
+          raise ValueError("Must contain recognized measurement.")
+        for smiles, measurement in items:
           # TODO(rbharath): Get a less kludgey answer
           # TODO(rbharath): There is some amount of duplicate collisions
           # due to choice of smiles generation. Look into this more
           # carefully and see if the underlying issues are fundamental..
           try:
-            if potency is None or np.isnan(potency):
+            if measurement is None or np.isnan(measurement):
               continue
           except TypeError:
             continue
-          if mol not in datapoints:
-            datapoints[mol] = {}
+          if smiles not in datapoints:
+            datapoints[smiles] = {}
             # Ensure that each target has some entry in dict.
             for name in target_names:
               # Set all targets to invalid for now.
-              datapoints[mol][name] = -1
-          datapoints[mol][target_name] = potency
+              datapoints[smiles][name] = -1
+          datapoints[smiles][target_name] = measurement
   return datapoints
 
 def load_datasets(paths, datatype="vs", **load_args):
@@ -184,7 +194,6 @@ def load_pdbbind_datasets(pdbbind_paths):
           "features": row[1],
         })
   df = pd.DataFrame(data)
-  print df.shape
   return df
 
 def load_vs_datasets(paths, target_dir_name="targets",
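load_assays now keys datapoints on the generated SMILES string rather than a mol-named variable, and seeds every target with a -1 sentinel before filling in measured values. The resulting structure looks roughly like this (keys and numbers illustrative):

  datapoints = {
    "CCO": {
      "tdo_percent_activity_10_um": 42.0,  # measured value
      "ido_percent_activity_10_um": -1,    # -1 marks "no measurement"
    },
  }

This interacts with the eval_model change above: -1 labels are skipped only for classification tasks, so regression targets like these are always scored.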
+0 −8
@@ -48,12 +48,6 @@ def transform_outputs(dataset, task_transforms, desc_transforms={},
     transforms.update(desc_transforms)
   for task, target in enumerate(endpoints):
     task_transforms = transforms[target]
-    print "Task %d has NaNs?" % task
-    print np.any(np.isnan(y[:, task]))
-    print "Task %d data" % task
-    print y[:, task]
-    print "Task %d distribution" % task
-    summarize_distribution(y[:, task])
     for task_transform in task_transforms:
       if task_transform == "log":
         y[:, task] = np.log(y[:, task])
@@ -78,8 +72,6 @@ def transform_outputs(dataset, task_transforms, desc_transforms={},
         task_data[nonzero] = task_data[nonzero] / std
       else:
         raise ValueError("Task tranform must be 1+max-val, log, or normalize")
-    print "Post-transform task %d distribution" % task
-    summarize_distribution(y[:, task])
   return X, y, W
 
 def to_one_hot(y):
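With the debug prints removed, transform_outputs now applies its transforms silently. A rough reconstruction of what the normalize branch does to a single target column, inferred from the surviving context lines (mean-center, then scale by a nonzero standard deviation); this is a sketch, not the library code:

  import numpy as np

  def normalize_column(y_task):
    # Mean-center, then divide by the standard deviation, guarding
    # against a constant column (std == 0). The real transform_outputs
    # also supports "log" and "1+max-val" transforms.
    y_task = y_task - np.mean(y_task)
    std = np.std(y_task)
    if std != 0:
      y_task = y_task / std
    return y_task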