Commit 6e65d9cf authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files
parents 3895d0fd 7bf3ea43
Loading
Loading
Loading
Loading
+11 −1
Original line number Diff line number Diff line
# deep-learning
deep_chem
=============

Deep Learning Toolchain for Cheminformatics and Protein Analysis

Requirements
------------
* [rdkit](http://www.rdkit.org/docs/Install.html)
* [sklearn](https://github.com/scikit-learn/scikit-learn.git)
* [numpy](https://store.continuum.io/cshop/anaconda/)
* [keras](keras.io)
+1 −1
Original line number Diff line number Diff line
@@ -28,7 +28,7 @@ def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--datasets', required=1, nargs="+",
                      choices=['muv', 'pcba', 'dude', 'pfizer'],
                      choices=['muv', 'pcba', 'dude', 'pfizer', 'globavir'],
                      help='Name of dataset to process.')
  parser.add_argument("--paths", required=1, nargs="+",
                      help = "Paths to input datasets.")
+119 −0
Original line number Diff line number Diff line
"""
Process an input dataset into a format suitable for machine learning.
"""
import os
import cPickle as pickle
import gzip
import pandas as pd
import openpyxl as px
import numpy as np
import argparse
from rdkit import Chem
import subprocess
from vs_utils.utils import SmilesGenerator

def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--xlsx', required=1,
                      help='Excel file with Globavir data.')
  parser.add_argument("--name", required=1,
                      help="Name of the dataset.")
  parser.add_argument("--out", required=1,
                      help="Folder to generate processed dataset in.")
  return parser.parse_args(input_args)

def generate_directories(name, out):
  """Generate processed dataset."""
  dataset_dir = os.path.join(out, name)
  if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)
  fingerprint_dir = os.path.join(dataset_dir, "circular-scaffold-smiles")
  if not os.path.exists(fingerprint_dir):
    os.makedirs(fingerprint_dir)
  target_dir = os.path.join(dataset_dir, "targets")
  if not os.path.exists(target_dir):
    os.makedirs(target_dir)
  shards_dir = os.path.join(dataset_dir, "shards")
  if not os.path.exists(shards_dir):
    os.makedirs(shards_dir)

  # Return names of files to be generated
  out_pkl = os.path.join(target_dir, "%s.pkl.gz" % name)
  out_sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
  return out_pkl, out_sdf

def parse_float_input(val):
  """Safely parses a float input."""
  # TODO(rbharath): Correctly parse float ranges.
  try:
    if val is None:
      return val
    else:
      fval = float(val)
      return fval
  except ValueError:
    if ">" in val or "<" in val or "-" in val:
      return np.nan

def generate_fingerprints(name, out):
  dataset_dir = os.path.join(out, name)
  fingerprint_dir = os.path.join(dataset_dir, "circular-scaffold-smiles")
  shards_dir = os.path.join(dataset_dir, "shards")
  sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
  fingerprints = os.path.join(fingerprint_dir,
      "%s-circular-scaffolds-smiles.pkl.gz" % name)
  subprocess.call(["python", "-m", "vs_utils.scripts.featurize",
                   "--scaffolds", "--smiles",
                   sdf, fingerprints,
                   "circular", "--size", "1024"])


def generate_targets(xlsx_file, out_pkl, out_sdf):
  """Process Globavir xlsx file."""
  rows, mols = [], []
  W = px.load_workbook(xlsx_file, use_iterators=True)
  p = W.get_sheet_by_name(name="Sheet1")
  smiles = SmilesGenerator()
  for row_index, row in enumerate(p.iter_rows()):
    # Skip row labels.
    if row_index == 0:
      continue
    row_data = [cell.internal_value for cell in row]
    # TODO(rbharath): Generalize this code to work for non-Globavir data. 
    row = {
      "compound_name": row_data[0],
      "isomeric_smiles": row_data[1],
      "tdo_ic50_nm": parse_float_input(row_data[5]),
      "tdo_Ki_nm": parse_float_input(row_data[6]),
      "tdo_percent_activity_10_um": parse_float_input(row_data[7]),
      "tdo_percent_activity_1_um": parse_float_input(row_data[8]),
      "ido_ic50_nm": parse_float_input(row_data[9]),
      "ido_Ki_nm": parse_float_input(row_data[10]),
      "ido_percent_activity_10_um": parse_float_input(row_data[11]),
      "ido_percent_activity_1_um": parse_float_input(row_data[12])
    }
    mol = Chem.MolFromSmiles(row["isomeric_smiles"])
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    rows.append(row)
  df = pd.DataFrame(rows)
  # Write pkl.gz file
  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
  # Write sdf.gz file
  with gzip.open(out_sdf, "wb") as gz:
    w = Chem.SDWriter(gz)
    for mol in mols:
      w.write(mol)
    w.close()

def main():
  args = parse_args()
  out_pkl, out_sdf = generate_directories(args.name, args.out)
  generate_targets(args.xlsx, out_pkl, out_sdf)
  generate_fingerprints(args.name, args.out)


if __name__ == "__main__":
  main()
+20 −11
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score

def model_predictions(test_set, model, n_targets, n_descriptors=0,
def model_predictions(test_set, model, n_targets, task_types, n_descriptors=0,
    add_descriptors=False, modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

@@ -29,6 +29,9 @@ def model_predictions(test_set, model, n_targets, n_descriptors=0,
    A trained scikit-learn or keras model.
  n_targets: int
    Number of output targets
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  n_descriptors: int
    Number of output descriptors
  modeltype: string
@@ -42,15 +45,20 @@ def model_predictions(test_set, model, n_targets, n_descriptors=0,
    n_outputs = n_targets + n_descriptors
  else:
    n_outputs = n_targets
  if modeltype == "sklearn":
    ypreds = model.predict_proba(X)
  elif modeltype == "keras":
    ypreds = model.predict(X)
  elif modeltype == "keras_multitask":
  if modeltype == "keras_multitask":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_outputs):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
  elif modeltype == "keras":
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  # Handle the edge case for singletask. 
@@ -97,19 +105,20 @@ def eval_model(test_set, model, task_types, desc_transforms={}, modeltype="sklea
    local_task_types = task_types.copy()
    endpoints = sorted_targets
  ypreds = model_predictions(test_set, model, len(sorted_targets),
      n_descriptors=len(desc_transforms), modeltype=modeltype,
      add_descriptors=add_descriptors)
      local_task_types, n_descriptors=len(desc_transforms),
      modeltype=modeltype, add_descriptors=add_descriptors)
  results = {}
  for target in endpoints:
    results[target] = ([], [])  # (ytrue, yscore)
  # Iterate through test set data points.
  sorted_smiles = sorted(test_set.keys())
  for index, smiles in enumerate(sorted_smiles):
  for index, smiles in enumerate(sorted(test_set.keys())):
    datapoint = test_set[smiles]
    labels = datapoint["labels"]
    for t_ind, target in enumerate(endpoints):
      task_type = local_task_types[target]
      if target in sorted_targets and labels[target] == -1:
      if (task_type == "classification"
        and target in sorted_targets
        and labels[target] == -1):
        continue
      else:
        ytrue, yscore = results[target]
+18 −9
Original line number Diff line number Diff line
@@ -30,6 +30,10 @@ def get_default_task_types_and_transforms(dataset_specs):
      for target in targets:
        task_types[target] = "regression"
        task_transforms[target] = ["log", "normalize"]
    elif name == "globavir":
      for target in targets:
        task_types[target] = "regression"
        task_transforms[target] = ["normalize"]
    elif name == "pdbbind":
      raise ValueError("pdbbind not yet supported!")
  return task_types, task_transforms
@@ -77,7 +81,10 @@ def load_molecules(paths, dir_name="circular-scaffold-smiles"):
  molecules = {}
  for dataset_path in paths:
    pickle_dir = os.path.join(dataset_path, dir_name)
    for pickle_file in os.listdir(pickle_dir):
    pickle_files = os.listdir(pickle_dir)
    if len(pickle_files) == 0:
      raise ValueError("No Pickle Files found to load molecules")
    for pickle_file in pickle_files:
      with gzip.open(os.path.join(pickle_dir, pickle_file), "rb") as f:
        contents = pickle.load(f)
        smiles, fingerprints, scaffolds, mol_ids = (
@@ -134,25 +141,28 @@ def load_assays(paths, target_dir_name="targets"):
          items = zip(contents["smiles"], contents["potency"])
        elif "targets" in contents:
          items = zip(contents["smiles"], contents["targets"])
        # TODO(rbharath): Remove this horrible special purpose code.
        elif "tdo_percent_activity_10_um" in contents:
          items = zip(contents["smiles"], contents["tdo_percent_activity_10_um"])
        else:
          raise ValueError("Must contain either potency or targets field.")
        for mol, potency in items:
          raise ValueError("Must contain recognized measurement.")
        for smiles, measurement in items:
          # TODO(rbharath): Get a less kludgey answer
          # TODO(rbharath): There is some amount of duplicate collisions
          # due to choice of smiles generation. Look into this more
          # carefully and see if the underlying issues are fundamental..
          try:
            if potency is None or np.isnan(potency):
            if measurement is None or np.isnan(measurement):
              continue
          except TypeError:
            continue
          if mol not in datapoints:
            datapoints[mol] = {}
          if smiles not in datapoints:
            datapoints[smiles] = {}
            # Ensure that each target has some entry in dict.
            for name in target_names:
              # Set all targets to invalid for now.
              datapoints[mol][name] = -1
          datapoints[mol][target_name] = potency 
              datapoints[smiles][name] = -1
          datapoints[smiles][target_name] = measurement 
  return datapoints

def load_datasets(paths, datatype="vs", **load_args):
@@ -184,7 +194,6 @@ def load_pdbbind_datasets(pdbbind_paths):
          "features": row[1],
        })
  df = pd.DataFrame(data)
  print df.shape
  return df

def load_vs_datasets(paths, target_dir_name="targets",
Loading