Commit 57aef2d7 authored by Bharath Ramsundar
Browse files

Deep Docking Changes.

parent 0336e0d8
Loading
Loading
Loading
Loading
+22 −19
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import size_eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
@@ -168,26 +169,28 @@ def fit_singletask_mlp(paths, task_types, task_transforms,
        singletasks[target])
    model = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, **training_params)
    results = eval_model(test, model, {target: task_types[target]}, 
                         # We run singletask models as special cases of
                         # multitask.
    #results = eval_model(test, model, {target: task_types[target]}, 
    #                     # We run singletask models as special cases of
    #                     # multitask.
    #                     modeltype="keras_multitask")
    results = size_eval_model(test, model, {target: task_types[target]},
        modeltype="keras_multitask")
    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)
  #  target_aucs = compute_roc_auc_scores(results, task_types)
  #  target_r2s = compute_r2_scores(results, task_types)
  #  target_rms = compute_rms_scores(results, task_types)

    aucs.update(target_aucs)
    r2s.update(target_r2s)
    rms.update(target_rms)
  if aucs:
    print aucs
    print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  if r2s:
    print r2s
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  if rms:
    print rms
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))
  #  aucs.update(target_aucs)
  #  r2s.update(target_r2s)
  #  rms.update(target_rms)
  #if aucs:
  #  print aucs
  #  print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  #if r2s:
  #  print r2s
  #  print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  #if rms:
  #  print rms
  #  print "Mean RMS: %f" % np.mean(np.array(rms.values()))

def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
+20 −17
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import size_eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
@@ -82,25 +83,27 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(X_train, y_train.ravel())
    results = eval_model(test, model, {target: task_types[target]},
    #results = eval_model(test, model, {target: task_types[target]},
    #    modeltype="sklearn")
    results = size_eval_model(test, model, {target: task_types[target]},
        modeltype="sklearn")

    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)
    
    aucs.update(target_aucs)
    r2s.update(target_r2s)
    rms.update(target_rms)
  if aucs:
    print results_to_csv(aucs)
    print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  if r2s:
    print results_to_csv(r2s)
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  if rms:
    print results_to_csv(rms)
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))
  #  target_aucs = compute_roc_auc_scores(results, task_types)
  #  target_r2s = compute_r2_scores(results, task_types)
  #  target_rms = compute_rms_scores(results, task_types)
  #  
  #  aucs.update(target_aucs)
  #  r2s.update(target_r2s)
  #  rms.update(target_rms)
  #if aucs:
  #  print results_to_csv(aucs)
  #  print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  #if r2s:
  #  print results_to_csv(r2s)
  #  print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  #if rms:
  #  print results_to_csv(rms)
  #  print "Mean RMS: %f" % np.mean(np.array(rms.values()))


def fit_multitask_rf(dataset, splittype="random"):
+1 −1
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--datasets', required=1, nargs="+",
                      choices=['muv', 'pcba', 'dude', 'pfizer', 'globavir'],
                      choices=['muv', 'pcba', 'dude', 'pfizer', 'globavir', 'pdbbind'],
                      help='Name of dataset to process.')
  parser.add_argument("--paths", required=1, nargs="+",
                      help = "Paths to input datasets.")
+64 −0
Original line number Diff line number Diff line
@@ -12,6 +12,8 @@ from deep_chem.utils.preprocess import labels_to_weights
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

def model_predictions(test_set, model, n_targets, task_types,
    modeltype="sklearn"):
@@ -59,6 +61,68 @@ def model_predictions(test_set, model, n_targets, task_types,
    ypreds = [ypreds]
  return ypreds

def size_eval_model(test_set, model, task_types, modeltype="sklearn",
                    bucket_width=250, max_weight=2750):
  """Evaluates the model on the test set, broken down by molecular weight.

  Computes the exact molecular weight of every SMILES key in test_set,
  prints summary statistics of those weights, partitions the test set into
  half-open weight buckets [upper - bucket_width, upper), and prints R^2 and
  RMS scores for every non-empty bucket. Finally evaluates on the complete
  test set, prints the same scores, and returns that global evaluation.

  Parameters
  ----------
  test_set: dict
    Keyed by SMILES string. The values are passed through untouched to
    eval_model (presumably (fingerprint, labels) pairs -- confirm against
    the loader that builds the test set).
  model: model object
    Passed through to eval_model.
  task_types: dict
    Passed through to eval_model and to the score functions.
  modeltype: str
    Passed through to eval_model.
  bucket_width: int
    Width of each weight bucket. The default reproduces the original
    hard-coded 250-wide buckets.
  max_weight: int
    Largest bucket upper bound. Bucket upper bounds are bucket_width,
    2 * bucket_width, ... up to and including max_weight.

  Returns
  -------
  The result of eval_model on the full test set, so that callers written as
  `results = size_eval_model(...)` receive the same object they would get
  from eval_model.

  Raises
  ------
  ValueError
    If a SMILES string cannot be parsed, or if a molecule's weight is not
    below the largest bucket upper bound.
  """
  # Note: every print below takes a single parenthesized argument, which
  # produces identical output under the Python 2 print statement.
  weights = {}
  for smiles in test_set:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
      # MolFromSmiles reports an unparseable string by returning None; fail
      # here with a readable message instead of inside ExactMolWt.
      raise ValueError("Could not parse SMILES: " + str(smiles))
    weights[smiles] = ExactMolWt(mol)

  # list(...) keeps this a proper 1-d numeric array even where dict.values()
  # is a view rather than a list.
  weight_arr = np.array(list(weights.values()))
  print("mean: " + str(np.mean(weight_arr)))
  print("std: " + str(np.std(weight_arr)))
  print("max: " + str(np.amax(weight_arr)))
  print("min: " + str(np.amin(weight_arr)))

  # Ascending upper bounds; each bucket is keyed by its (exclusive) upper
  # bound, exactly as the original hand-written table was.
  upper_bounds = list(range(bucket_width, max_weight + 1, bucket_width))
  buckets = dict((upper, {}) for upper in upper_bounds)
  for smiles in test_set:
    weight = weights[smiles]
    for upper in upper_bounds:
      if weight < upper:
        buckets[upper][smiles] = test_set[smiles]
        break
    else:
      # No bucket accepted the molecule: it is at or above the last bound.
      raise ValueError("High Weight: " + str(weight))

  for upper in upper_bounds:
    test_bucket = buckets[upper]
    if not test_bucket:
      continue
    label = "%d-%d" % (upper - bucket_width, upper)
    print("Evaluating model for %s dalton molecules" % label)
    print("%d compounds in bucket" % len(test_bucket))
    results = eval_model(test_bucket, model, task_types, modeltype=modeltype)

    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)
    print("R^2: " + str(target_r2s))
    print("RMS: " + str(target_rms))

  print("Performing Global Evaluation")
  results = eval_model(test_set, model, task_types, modeltype=modeltype)
  target_r2s = compute_r2_scores(results, task_types)
  target_rms = compute_rms_scores(results, task_types)
  print("R^2: " + str(target_r2s))
  print("RMS: " + str(target_rms))
  return results
    
  

  
def eval_model(test_set, model, task_types, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.

+14 −7
Original line number Diff line number Diff line
@@ -35,7 +35,11 @@ def get_default_task_types_and_transforms(dataset_specs):
        task_types[target] = "regression"
        task_transforms[target] = ["normalize"]
    elif name == "pdbbind":
      raise ValueError("pdbbind not yet supported!")
      for target in targets:
        task_types[target] = "regression"
        task_transforms[target] = ["normalize"]
        #task_transforms[target] = []
  
  return task_types, task_transforms

def load_descriptors(paths, descriptor_dir_name="descriptors"):
@@ -67,7 +71,7 @@ def load_descriptors(paths, descriptor_dir_name="descriptors"):
                  index not in bad_sets])
  return descriptor_dict

def load_molecules(paths, dir_name="circular-scaffold-smiles"):
def load_molecules(paths, dir_name="fingerprints"):
  """Load dataset fingerprints and return fingerprints.

  Returns a dictionary that maps smiles strings to dicts that contain
@@ -137,15 +141,18 @@ def load_assays(paths, target_dir_name="targets"):
      target_name = target_pickle.split(".")[0]
      with gzip.open(os.path.join(target_dir, target_pickle), "rb") as f:
        contents = pickle.load(f)
        # TODO(rbharath): Make endpoint a flag that can be passed in.
        if "potency" in contents:
          items = zip(contents["smiles"], contents["potency"])
          endpoint = "potency" 
        elif "targets" in contents:
          items = zip(contents["smiles"], contents["targets"])
        # TODO(rbharath): Remove this horrible special purpose code.
          endpoint = "targets"  
        elif "label" in contents:
          endpoint = "label"
        elif "tdo_percent_activity_10_um" in contents:
          items = zip(contents["smiles"], contents["tdo_percent_activity_10_um"])
          endpoint = "tdo_percent_activity_10_um"
        else:
          raise ValueError("Must contain recognized measurement.")
        items = zip(contents["smiles"], contents[endpoint])
        for smiles, measurement in items:
          # TODO(rbharath): Get a less kludgey answer
          # TODO(rbharath): There is some amount of duplicate collisions
@@ -197,7 +204,7 @@ def load_pdbbind_datasets(pdbbind_paths):
  return df

def load_vs_datasets(paths, target_dir_name="targets",
    fingerprint_dir_name="circular-scaffold-smiles"):
    fingerprint_dir_name="fingerprints"):
  """Load both labels and fingerprints.

  Returns a dictionary that maps smiles to pairs of (fingerprint, labels)
Loading