Commit f6b13490 authored by Evan N. Feinberg

Merge pull request #16 from rbharath/fixes

Major Refactoring of deep-chem
parents 1d396a11 480514ff
deep_chem/models/deep.py  +14 −59
@@ -2,26 +2,15 @@
Code for processing the Google vs-datasets using keras.
"""
import numpy as np
import sys
import keras
from keras.models import Graph
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from deep_chem.utils.load import load_datasets
from deep_chem.utils.load import ensure_balanced
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.load import load_and_transform_dataset


def fit_multitask_mlp(train_data, test_data, task_types, **training_params):
def fit_multitask_mlp(train_data, task_types, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns AUCs, R^2 scores, and RMS values.
@@ -34,22 +23,15 @@ def fit_multitask_mlp(train_data, test_data, task_types, **training_params):
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
      train_data, test_data)
  model = train_multitask_model(X_train, y_train, W_train, task_types,
  models = {}
  # Follows convention from process_datasets that the data for multitask models
  # is grouped under key "all"
  (_, X_train, y_train, W_train) = train_data["all"]
  models["all"] = train_multitask_model(X_train, y_train, W_train, task_types,
                                **training_params)
  results = eval_model(test, model, task_types,
      modeltype="keras_multitask")
  local_task_types = task_types.copy()
  aucs = compute_roc_auc_scores(results, local_task_types)
  if aucs:
    print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  r2s = compute_r2_scores(results, local_task_types)
  if r2s:
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  return results
  return models

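Note that the refactored fit_multitask_mlp takes only training data, keyed under "all", and returns the trained models instead of printing metrics. A minimal sketch of the new calling convention, with array shapes and keyword values invented for illustration:

import numpy as np
from deep_chem.models.deep import fit_multitask_mlp

# Illustrative shapes only; entries follow the (ids, X, y, W) tuple
# convention used by the refactored data loading.
n_samples, n_features, n_tasks = 100, 1024, 3
task_types = {"task%d" % i: "classification" for i in range(n_tasks)}
train_data = {"all": (range(n_samples),
                      np.random.rand(n_samples, n_features),
                      np.random.randint(2, size=(n_samples, n_tasks)),
                      np.ones((n_samples, n_tasks)))}
models = fit_multitask_mlp(train_data, task_types, n_hidden=500,
                           learning_rate=0.01, dropout=0.5,
                           nb_epoch=50, batch_size=32)
multitask_model = models["all"]  # the single keras model covering all tasks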
def fit_singletask_mlp(per_task_data, task_types, num_to_train=None, **training_params):
def fit_singletask_mlp(train_data, task_types, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras MLP.

@@ -62,42 +44,15 @@ def fit_singletask_mlp(per_task_data, task_types, num_to_train=None, **training_
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  ret_vals = {}
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(per_task_data.keys())
  if num_to_train:
    sorted_targets = sorted_targets[:num_to_train]
  all_results = {}
  for index, target in enumerate(sorted_targets):
  models = {}
  for index, target in enumerate(sorted(train_data.keys())):
    print "Training model %d" % index
    print "Target %s" % target
    (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        per_task_data[target])
    print "len(train)"
    print len(train)
    print "len(test)"
    print len(test)
    model = train_multitask_model(X_train, y_train, W_train,
    (train_ids, X_train, y_train, W_train) = train_data[target]
    print "%d compounds in Train" % len(train_ids)
    models[target] = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, **training_params)
    results = eval_model(test, model, {target: task_types[target]}, 
                         # We run singletask models as special cases of
                         # multitask.
                         modeltype="keras_multitask")
    all_results[target] = results[target]
    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)

    aucs.update(target_aucs)
    r2s.update(target_r2s)
    rms.update(target_rms)
  if aucs:
    print aucs
    print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  if r2s:
    print r2s
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  return all_results
  return models

def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
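Since metric printing has been stripped out of the fit functions, evaluation becomes the caller's responsibility. A hedged sketch of that follow-up step, assuming eval_model and compute_roc_auc_scores keep the signatures visible in the removed lines above, and that a test_data dict mirrors the {target: (ids, X, y, W)} layout of train_data:

import numpy as np
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_roc_auc_scores

# `models` is the dict returned by fit_singletask_mlp above.
all_results = {}
for target, model in models.iteritems():
  test = test_data[target]  # assumed to carry whatever eval_model expects
  results = eval_model(test, model, {target: task_types[target]},
                       modeltype="keras_multitask")
  all_results[target] = results[target]
aucs = compute_roc_auc_scores(all_results, task_types)
if aucs:
  print "Mean AUC: %f" % np.mean(np.array(aucs.values()))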
deep_chem/models/deep3d.py  +6 −18
@@ -6,30 +6,18 @@ from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.utils import np_utils
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import tensor_dataset_to_numpy
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores

def fit_3D_convolution(train_data, test_data, task_types, axis_length=32, **training_params):
def fit_3D_convolution(per_task_data, task_types, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
  (X_train, y_train, W_train, train), (X_test, y_test, W_test, test) = (
      train_data, test_data)

  models = {}
  (_, X_train, y_train, _), _ = per_task_data["all"]
  nb_classes = 2
  model = train_3D_convolution(X_train, y_train, axis_length, **training_params)
  results = eval_model(test, model, task_types,
      modeltype="keras", mode="tensor")
  local_task_types = task_types.copy()
  r2s = compute_r2_scores(results, local_task_types)
  print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  return results
  models["all"] = train_3D_convolution(X_train, y_train, **training_params)
  return models

def train_3D_convolution(X, y, axis_length=32, batch_size=50, nb_epoch=1):
def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
  """
  Fit a keras 3D CNN to data.

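The 3D CNN path consumes tensorized features rather than flat vectors. A rough sketch of invoking the refactored trainer; the 32-unit grid axis and single input channel are assumptions carried over from the old axis_length=32 default:

import numpy as np
from deep_chem.models.deep3d import fit_3D_convolution

n_samples, axis_length = 10, 32
X_train = np.random.rand(n_samples, 1, axis_length, axis_length, axis_length)
y_train = np.random.randint(2, size=(n_samples,))  # nb_classes = 2
# per_task_data["all"] is a pair whose first element is the (ids, X, y, W)
# tuple, matching the unpacking inside fit_3D_convolution.
per_task_data = {"all": ((range(n_samples), X_train, y_train, None), None)}
models = fit_3D_convolution(per_task_data, task_types={},
                            nb_epoch=1, batch_size=50)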
deep_chem/models/standard.py  +11 −45
@@ -2,14 +2,6 @@
Code for processing datasets using scikit-learn.
"""
import numpy as np
from deep_chem.utils.analysis import results_to_csv
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskLasso 
@@ -21,8 +13,7 @@ from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.svm import SVR

def fit_singletask_models(per_task_data, modeltype, task_types,
    num_to_train=None):
def fit_singletask_models(train_data, modeltype, task_types):
  """Fits singletask linear regression models to potency.

  Parameters
@@ -42,15 +33,10 @@ def fit_singletask_models(per_task_data, modeltype, task_types,
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  """
  all_results = {}
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(per_task_data.keys())
  if num_to_train:
    sorted_targets = sorted_targets[:num_to_train]
  for index, target in enumerate(sorted_targets):
  models = {}
  for index, target in enumerate(sorted(train_data.keys())):
    print "Building model %d" % index
    (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        per_task_data[target])
    (_, X_train, y_train, W_train) = train_data[target]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
@@ -72,36 +58,16 @@ def fit_singletask_models(per_task_data, modeltype, task_types,
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(X_train, y_train.ravel())
    results = eval_model(test, model, {target: task_types[target]},
        modeltype="sklearn")
    all_results[target] = results[target]
    models[target] = model
  return models

    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)
    
    aucs.update(target_aucs)
    r2s.update(target_r2s)
    rms.update(target_rms)
  if aucs:
    print results_to_csv(aucs)
    print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  if r2s:
    print results_to_csv(r2s)
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  if rms:
    print results_to_csv(rms)
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))
  return all_results

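The same return-the-models convention now applies to the scikit-learn wrappers. A small sketch against the refactored API, with dataset contents invented for illustration:

import numpy as np
from deep_chem.models.standard import fit_singletask_models

n_samples, n_features = 50, 1024
train_data = {"my_target": (range(n_samples),
                            np.random.rand(n_samples, n_features),
                            np.random.rand(n_samples, 1),  # y is ravel'd inside
                            np.ones((n_samples, 1)))}
task_types = {"my_target": "regression"}
models = fit_singletask_models(train_data, "rf_regressor", task_types)
forest = models["my_target"]  # a fitted sklearn RandomForestRegressor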
def fit_multitask_rf(train_data, test_data, task_types):
# TODO(rbharath): I believe this is broken. Update it to work with the rest of
# the package.
def fit_multitask_rf(train_data, task_types):
  """Fits a multitask RF model to provided dataset.
  """
  (train, X_train, y_train, W_train), (test, X_train, y_train, W_train) = (
      train_data, test_data) 
  (_, X_train, y_train, _) = train_data
  model = RandomForestClassifier(n_estimators=100, n_jobs=-1,
      class_weight="auto")
  model.fit(X_train, y_train)
  results = eval_model(test, model, task_types)
  scores = compute_roc_auc_scores(results)
  print "Mean AUC: %f" % np.mean(np.array(scores.values()))
  return model
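The TODO above flags fit_multitask_rf as out of step with the refactor. A speculative sketch only, not the repo's code, of what the function might look like once it follows the "all"-keyed convention used elsewhere:

from sklearn.ensemble import RandomForestClassifier

def fit_multitask_rf_sketch(train_data, task_types):
  # Guessing that train_data would be keyed under "all" like the other
  # multitask fitters; the current diff unpacks train_data directly.
  (_, X_train, y_train, _) = train_data["all"]
  model = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                 class_weight="auto")
  model.fit(X_train, y_train)
  return {"all": model}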
+0 −124
"""
Convenience script to train basic models on supported datasets.
"""
import argparse
import numpy as np
from deep_chem.models.deep import fit_singletask_mlp
from deep_chem.models.deep import fit_multitask_mlp
from deep_chem.models.deep3d import fit_3D_convolution
from deep_chem.models.standard import fit_singletask_models
from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import process_datasets
from deep_chem.utils.evaluate import results_to_csv

# TODO(rbharath): Factor this into subcommands. The interface is too
# complicated now to effectively use.
def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
  parser.add_argument("--input-transforms", nargs="+", default=[],
                      choices=["normalize", "truncate-outliers"],
                      help="Transforms to apply to input data.")
  parser.add_argument("--output-transforms", nargs="+", default=[],
                      choices=["log", "normalize"],
                      help="Transforms to apply to output data.")
  parser.add_argument("--feature-types", nargs="+", required=1,
                      help="Types of featurizations to use.")
  parser.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")
  parser.add_argument("--mode", default="singletask",
                      choices=["singletask", "multitask"],
                      help="Type of model being built.")
  parser.add_argument("--model", required=1,
                      choices=["logistic", "rf_classifier", "rf_regressor",
                      "linear", "ridge", "lasso", "lasso_lars", "elastic_net",
                      "singletask_deep_network", "multitask_deep_network",
                      "3D_cnn"])
  parser.add_argument("--splittype", type=str, default="scaffold",
                       choices=["scaffold", "random", "specified"],
                       help="Type of train/test data-splitting.\n"
                            "scaffold uses Bemis-Murcko scaffolds.\n"
                            "specified requires that split be in original data.")
  parser.add_argument("--csv-out", type=str, default=None,
                  help="Outputted predictions on the test set.")
  #TODO(rbharath): These two arguments (prediction/split-endpoint) should be
  #moved to process_datasets to simplify the invocation here.
  parser.add_argument("--prediction-endpoint", type=str, required=1,
                       help="Name of measured endpoint to predict.")
  parser.add_argument("--split-endpoint", type=str, default=None,
                       help="Name of endpoint specifying train/test split.")
  parser.add_argument("--n-hidden", type=int, default=500,
                      help="Number of hidden neurons for NN models.")
  parser.add_argument("--learning-rate", type=float, default=0.01,
                  help="Learning rate for NN models.")
  parser.add_argument("--dropout", type=float, default=0.5,
                  help="Learning rate for NN models.")
  parser.add_argument("--n-epochs", type=int, default=50,
                  help="Number of epochs for NN models.")
  parser.add_argument("--batch-size", type=int, default=32,
                  help="Number of examples per minibatch for NN models.")
  parser.add_argument("--decay", type=float, default=1e-4,
                  help="Learning rate decay for NN models.")
  parser.add_argument("--validation-split", type=float, default=0.0,
                  help="Percent of training data to use for validation.")
  parser.add_argument("--weight-positives", type=bool, default=False,
                  help="Weight positive examples to have same total weight as negatives.")
  # TODO(rbharath): Remove this once debugging is complete.
  parser.add_argument("--num-to-train", type=int, default=None,
                  help="Number of datasets to train on. Only for debug.")
  parser.add_argument("--axis-length", type=int, default=32,
                  help="Size of a grid axis for 3D CNN input.")
  return parser.parse_args(input_args)

def main():
  args = parse_args()
  paths = {}

  paths = args.paths

  targets = get_target_names(paths)
  task_types = {target: args.task_type for target in targets}
  input_transforms = args.input_transforms 
  output_transforms = {target: args.output_transforms for target in targets}

  datatype = "tensor" if args.model == "3D_cnn" else "vector"
  processed = process_datasets(paths,
      input_transforms, output_transforms, feature_types=args.feature_types, 
      prediction_endpoint=args.prediction_endpoint,
      split_endpoint=args.split_endpoint,
      splittype=args.splittype, weight_positives=args.weight_positives,
      datatype=datatype, mode=args.mode)
  if args.mode == "multitask":
    train_data, test_data = processed
  else:
    per_task_data = processed
  # TODO(rbharath): Bundle training params into a training_param dict that's passed
  # down to these functions.
  if args.model == "singletask_deep_network":
    results = fit_singletask_mlp(per_task_data, task_types, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      nb_epoch=args.n_epochs, decay=args.decay, batch_size=args.batch_size,
      validation_split=args.validation_split,
      num_to_train=args.num_to_train)
  elif args.model == "multitask_deep_network":
    results = fit_multitask_mlp(train_data, test_data, task_types,
      n_hidden=args.n_hidden, learning_rate = args.learning_rate,
      dropout = args.dropout, batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split)
  elif args.model == "3D_cnn":
    results = fit_3D_convolution(train_data, test_data, task_types,
        axis_length=args.axis_length, nb_epoch=args.n_epochs,
        batch_size=args.batch_size)
  else:
    results = fit_singletask_models(per_task_data, args.model, task_types,
                                    num_to_train=args.num_to_train)
  
  if args.csv_out is not None:
    results_to_csv(results, args.csv_out, task_type=args.task_type)

if __name__ == "__main__":
  main()
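For reference, parse_args accepts an input_args list, so the removed script could be driven programmatically as well as from a shell. All flag values below are placeholders; only the flags themselves come from the parser definition above:

# Hypothetical invocation covering the four required flags.
args = parse_args(["--paths", "/data/my_dataset.pkl",
                   "--feature-types", "fingerprints",
                   "--prediction-endpoint", "potency",
                   "--model", "rf_regressor",
                   "--mode", "singletask"])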
+278 −0

File added (contents collapsed in preview).