Commit 9cfaee15 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Adding CSV output support.

parent 1f756744
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -47,6 +47,7 @@ def fit_multitask_mlp(train_data, test_data, task_types, **training_params):
  r2s = compute_r2_scores(results, local_task_types)
  if r2s:
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  return results

def fit_singletask_mlp(per_task_data, task_types, num_to_train=None, **training_params):
  """
@@ -61,12 +62,12 @@ def fit_singletask_mlp(per_task_data, task_types, num_to_train=None, **training_
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  print "ENTERING FIT_SINGLETASK_MLP"
  ret_vals = {}
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(per_task_data.keys())
  if num_to_train:
    sorted_targets = sorted_targets[:num_to_train]
  all_results = {}
  for index, target in enumerate(sorted_targets):
    print "Training model %d" % index
    print "Target %s" % target
@@ -82,6 +83,7 @@ def fit_singletask_mlp(per_task_data, task_types, num_to_train=None, **training_
                         # We run singletask models as special cases of
                         # multitask.
                         modeltype="keras_multitask")
    all_results[target] = results[target]
    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)
@@ -95,6 +97,7 @@ def fit_singletask_mlp(per_task_data, task_types, num_to_train=None, **training_
  if r2s:
    print r2s
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  return all_results

def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
+1 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ def fit_3D_convolution(train_data, test_data, task_types, axis_length=32, **trai
  local_task_types = task_types.copy()
  r2s = compute_r2_scores(results, local_task_types)
  print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  return results

def train_3D_convolution(X, y, axis_length=32, batch_size=50, nb_epoch=1):
  """
+3 −0
Original line number Diff line number Diff line
@@ -42,6 +42,7 @@ def fit_singletask_models(per_task_data, modeltype, task_types,
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  """
  all_results = {}
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(per_task_data.keys())
  if num_to_train:
@@ -73,6 +74,7 @@ def fit_singletask_models(per_task_data, modeltype, task_types,
    model.fit(X_train, y_train.ravel())
    results = eval_model(test, model, {target: task_types[target]},
        modeltype="sklearn")
    all_results[target] = results[target]

    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
@@ -90,6 +92,7 @@ def fit_singletask_models(per_task_data, modeltype, task_types,
  if rms:
    print results_to_csv(rms)
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))
  return all_results

def fit_multitask_rf(train_data, test_data, task_types):
  """Fits a multitask RF model to provided dataset.
+11 −5
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ from deep_chem.models.deep3d import fit_3D_convolution
from deep_chem.models.standard import fit_singletask_models
from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import process_datasets
from deep_chem.utils.evaluate import results_to_csv

# TODO(rbharath): Factor this into subcommands. The interface is too
# complicated now to effectively use.
@@ -42,6 +43,8 @@ def parse_args(input_args=None):
                       help="Type of train/test data-splitting.\n"
                            "scaffold uses Bemis-Murcko scaffolds.\n"
                            "specified requires that split be in original data.")
  parser.add_argument("--csv-out", type=str, default=None,
                  help="Outputted predictions on the test set.")
  #TODO(rbharath): These two arguments (prediction/split-endpoint) should be
  #moved to process_dataset to simplify the invocation here.
  parser.add_argument("--prediction-endpoint", type=str, required=1,
@@ -69,7 +72,6 @@ def parse_args(input_args=None):
                  help="Number of datasets to train on. Only for debug.")
  parser.add_argument("--axis-length", type=int, default=32,
                  help="Size of a grid axis for 3D CNN input.")
      
  return parser.parse_args(input_args)

def main():
@@ -97,23 +99,27 @@ def main():
  # TODO(rbharath): Bundle training params into a training_param dict that's passed
  # down to these functions.
  if args.model == "singletask_deep_network":
    fit_singletask_mlp(per_task_data, task_types, n_hidden=args.n_hidden,
    results = fit_singletask_mlp(per_task_data, task_types, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      nb_epoch=args.n_epochs, decay=args.decay, batch_size=args.batch_size,
      validation_split=args.validation_split,
      num_to_train=args.num_to_train)
  elif args.model == "multitask_deep_network":
    fit_multitask_mlp(train_data, test_data, task_types,
    results = fit_multitask_mlp(train_data, test_data, task_types,
      n_hidden=args.n_hidden, learning_rate = args.learning_rate,
      dropout = args.dropout, batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split)
  elif args.model == "3D_cnn":
    fit_3D_convolution(train_data, test_data, task_types,
    results = fit_3D_convolution(train_data, test_data, task_types,
        axis_length=args.axis_length, nb_epoch=args.n_epochs,
        batch_size=args.batch_size)
  else:
    fit_singletask_models(per_task_data, args.model, task_types, num_to_train=args.num_to_train)
    results = fit_singletask_models(per_task_data, args.model, task_types,
                                    num_to_train=args.num_to_train)
  
  if args.csv_out is not None:
    results_to_csv(results, args.csv_out, task_type=args.task_type)

if __name__ == "__main__":
  main()
+28 −13
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import csv
import numpy as np
import warnings
from deep_chem.utils.preprocess import dataset_to_numpy
@@ -157,7 +158,7 @@ def eval_model(test_set, model, task_types, modeltype="sklearn", mode="regular")
      local_task_types, modeltype=modeltype, mode=mode)
  results = {}
  for target in endpoints:
    results[target] = ([], [])  # (ytrue, yscore)
    results[target] = ([], [], [])  # (smiles, ytrue, yscore)
  # Iterate through test set data points.
  for index, smiles in enumerate(sorted(test_set.keys())):
    datapoint = test_set[smiles]
@@ -169,7 +170,8 @@ def eval_model(test_set, model, task_types, modeltype="sklearn", mode="regular")
        and labels[target] == -1):
        continue
      else:
        ytrue, yscore = results[target]
        mol_smiles, ytrue, yscore = results[target]
        mol_smiles.append(smiles)
        if task_type == "classification":
          if labels[target] == 0:
            ytrue.append(0)
@@ -188,10 +190,28 @@ def eval_model(test_set, model, task_types, modeltype="sklearn", mode="regular")
          raise ValueError("task_type must be classification or regression.")
        yscore.append(ypreds[t_ind][index])
  for target in endpoints:
    ytrue, yscore = results[target]
    results[target] = (np.array(ytrue), np.array(yscore))
    mol_smiles, ytrue, yscore = results[target]
    results[target] = (mol_smiles, np.array(ytrue), np.array(yscore))
  return results

def results_to_csv(results, out, task_type="classification"):
  """Writes results as CSV to out."""
  for target in results:
    out_file = "%s-%s.csv" % (out, target)
    mol_smiles, ytrues, yscores= results[target]
    if task_type == "classification":
      yscores = np.around(yscores[:,1]).astype(int)
    elif task_type == "regression":
      print yscores
      yscores = yscores[:,0]
    with open(out_file, "wb") as csvfile:
      csvwriter = csv.writer(csvfile, delimiter="\t")
      csvwriter.writerow(["Smiles", "True", "Model-Prediction"])
      for smiles, ytrue, yscore in zip(mol_smiles, ytrues, yscores):
        csvwriter.writerow([smiles, ytrue, yscore])
    print "Writing results on test set for target %s to %s" % (target, out_file)
    

def compute_roc_auc_scores(results, task_types):
  """Transforms the results dict into roc-auc-scores and prints scores.

@@ -208,18 +228,13 @@ def compute_roc_auc_scores(results, task_types):
  for target in results:
    if task_types[target] != "classification":
      continue
    ytrue, yscore = results[target]
    _, ytrue, yscore = results[target]
    sample_weights = labels_to_weights(ytrue)
    print "np.shape(ytrue)"
    print np.shape(ytrue)
    print "np.shape(yscore)"
    print np.shape(yscore)
    try:
      score = roc_auc_score(ytrue, yscore[:,1], sample_weight=sample_weights)
    except Exception as e:
      warnings.warn("ERROR! ROC_AUC_SCORE CALCULATION FAILED.")
      warnings.warn("ROC AUC score calculation failed.")
      score = 0.5
    #score = roc_auc_score(ytrue, yscore, sample_weight=sample_weights)
    print "Target %s: AUC %f" % (target, score)
    scores[target] = score
  return scores
@@ -240,7 +255,7 @@ def compute_r2_scores(results, task_types):
  for target in results:
    if task_types[target] != "regression":
      continue
    ytrue, yscore = results[target]
    _, ytrue, yscore = results[target]
    score = r2_score(ytrue, yscore)
    print "Target %s: R^2 %f" % (target, score)
    scores[target] = score
@@ -262,7 +277,7 @@ def compute_rms_scores(results, task_types):
  for target in results:
    if task_types[target] != "regression":
      continue
    ytrue, yscore = results[target]
    _, ytrue, yscore = results[target]
    rms = np.sqrt(mean_squared_error(ytrue, yscore))
    print "Target %s: RMS %f" % (target, rms)
    scores[target] = rms