Commit 467aa60e authored by Bharath Ramsundar
Browse files

Second batch of changes. Descriptors now work. Code for specified train/test...

Second batch of changes. Descriptors now work. Code for specified train/test in place, but not working yet.
parent 93ece013
Loading
Loading
Loading
Loading
+9 −109
Original line number Diff line number Diff line
@@ -11,124 +11,32 @@ from keras.optimizers import SGD
from deep_chem.utils.load import load_datasets
from deep_chem.utils.load import ensure_balanced
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.utils.preprocess import process_multitask_dataset
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.load import load_and_transform_dataset

def process_multitask(paths, input_transforms, output_transforms, feature_types,
    splittype="random", seed=None, weight_positives=False,
    prediction_endpoint=None):
  """Extracts multitask datasets and splits into train/test.

  Returns a flat 8-tuple
  (train, X_train, y_train, W_train, test, X_test, y_test, W_test), where the
  X/y/W entries are whatever dataset_to_numpy returns for each split.

  TODO(rbharath): This function is ugly. Returns way too many arguments. Clean
  it up.

  Parameters
  ----------
  paths: list
    List of paths to datasets.
  input_transforms:
    Forwarded unchanged to load_and_transform_dataset.
  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    None, "log", "normalize" or "log-normalize". Only for regression outputs.
  feature_types:
    Forwarded unchanged to load_and_transform_dataset.
  splittype: string
    Must be "random" or "scaffold"
  seed: int
    Seed used for random splits.
  weight_positives:
    Forwarded unchanged to load_and_transform_dataset.
  prediction_endpoint:
    Forwarded to load_and_transform_dataset as its fourth positional argument.
    Keyword with a default so existing call sites keep working.
    NOTE(review): whether the loader accepts None here is not visible from this
    function -- confirm, or always pass the endpoint explicitly.

  Raises
  ------
  ValueError
    If splittype is neither "random" nor "scaffold".
  """
  # Reject a bad splittype before paying for the dataset load.
  if splittype not in ("random", "scaffold"):
    raise ValueError("Improper splittype. Must be random/scaffold.")
  # prediction_endpoint used to be read here without being defined anywhere in
  # this function, which raised NameError on every call.
  dataset = load_and_transform_dataset(paths, input_transforms, output_transforms,
      prediction_endpoint, feature_types=feature_types,
      weight_positives=weight_positives)
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  else:
    train, test = train_test_scaffold_split(dataset)
  X_train, y_train, W_train = dataset_to_numpy(train)
  X_test, y_test, W_test = dataset_to_numpy(test)
  ## TODO(rbharath): Still need to fix the failures for PCBA. The
  ## ensure_balanced(y, W) checks on the train and test sets (run only when
  ## weight_positives is set) are temporarily disabled to experiment.
  # NOTE(review): the return value is one flat tuple, not a (train, test) pair
  # of 4-tuples -- callers must unpack eight values.
  return (train, X_train, y_train, W_train, test, X_test, y_test, W_test)

def process_singletask(paths, input_transforms, output_transforms,
    prediction_endpoint, feature_types,
    splittype="random", seed=None,
    weight_positives=True):
  """Loads a dataset, separates it per target, and splits each into train/test.

  The dataset is loaded with load_and_transform_dataset, broken into one
  dataset per target with multitask_to_singletask, and every non-empty target
  is split and converted with dataset_to_numpy.

  Parameters
  ----------
  paths: list
    List of paths to the input datasets.
  input_transforms:
    Forwarded unchanged to load_and_transform_dataset.
  output_transforms: dict
    dict mapping target names to label transform. Each output type must be
    either None or "log". Only for regression outputs.
  prediction_endpoint:
    Forwarded unchanged to load_and_transform_dataset.
  feature_types:
    Forwarded unchanged to load_and_transform_dataset.
  splittype: string
    Must be "random" or "scaffold"
  seed: int
    Seed used for random splits.
  weight_positives:
    Forwarded unchanged to load_and_transform_dataset.

  Returns
  -------
  dict mapping each target name to the pair
  ((train, X_train, y_train, W_train), (test, X_test, y_test, W_test)).
  Targets whose dataset has length zero are left out.

  Raises
  ------
  ValueError
    If a non-empty target is reached and splittype is neither "random" nor
    "scaffold".
  """
  multitask = load_and_transform_dataset(
      paths, input_transforms, output_transforms, prediction_endpoint,
      feature_types=feature_types, weight_positives=weight_positives)
  per_target = multitask_to_singletask(multitask)
  arrays = {}
  for target in per_target:
    target_data = per_target[target]
    # Nothing to split for a target without any data points.
    if len(target_data) == 0:
      continue
    if splittype == "scaffold":
      train, test = train_test_scaffold_split(target_data)
    elif splittype == "random":
      train, test = train_test_random_split(target_data, seed=seed)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    X_train, y_train, W_train = dataset_to_numpy(train)
    train_bundle = (train, X_train, y_train, W_train)
    X_test, y_test, W_test = dataset_to_numpy(test)
    test_bundle = (test, X_test, y_test, W_test)
    arrays[target] = (train_bundle, test_bundle)
  return arrays


def fit_multitask_mlp(paths, task_types, input_transforms, output_transforms,
                      prediction_endpoint, feature_types, splittype="random",
                      weight_positives=False, **training_params):
def fit_multitask_mlp(train_data, test_data, task_types, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns AUCs, R^2 scores, and RMS values.

  Parameters
  ----------
  paths: list 
    List of paths to Google vs datasets. 
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  output_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None, "log", "normalize", or "log-normalize". Only for regression outputs.
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
      process_multitask(paths, input_transforms, output_transforms, feature_types, splittype=splittype,
                        weight_positives=weight_positives))
  print np.shape(y_train)
      train_data, test_data)
  model = train_multitask_model(X_train, y_train, W_train, task_types,
                                **training_params)
  results = eval_model(test, model, task_types,
@@ -141,16 +49,10 @@ def fit_multitask_mlp(paths, task_types, input_transforms, output_transforms,
  if r2s:
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))

def fit_singletask_mlp(paths, task_types, input_transforms, output_transforms,
											 prediction_endpoint,
                       feature_types,
                       splittype="random", weight_positives=True,
                       num_to_train=None, **training_params):
def fit_singletask_mlp(per_task_data, task_types, num_to_train=None, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras MLP.

  paths: list 
    List of paths to Google vs datasets. 
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
@@ -160,19 +62,16 @@ def fit_singletask_mlp(paths, task_types, input_transforms, output_transforms,
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  singletasks = process_singletask(paths, input_transforms, output_transforms,
		prediction_endpoint, feature_types,
    splittype=splittype, weight_positives=weight_positives)
  ret_vals = {}
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(singletasks.keys())
  sorted_targets = sorted(per_task_data.keys())
  if num_to_train:
    sorted_targets = sorted_targets[:num_to_train]
  for index, target in enumerate(sorted_targets):
    print "Training model %d" % index
    print "Target %s" % target
    (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        singletasks[target])
        per_task_data[target])
    model = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, **training_params)
    results = eval_model(test, model, {target: task_types[target]}, 
@@ -198,7 +97,7 @@ def fit_singletask_mlp(paths, task_types, input_transforms, output_transforms,

def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
  dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500, n_inputs=1024,
  dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
  validation_split=0.1):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
@@ -231,6 +130,7 @@ def train_multitask_model(X, y, W, task_types,
  sorted_targets = sorted(task_types.keys())
  local_task_types = task_types.copy()
  endpoints = sorted_targets
  (_, n_inputs) = np.shape(X)
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
  model = Graph()
+3 −32
Original line number Diff line number Diff line
@@ -7,47 +7,18 @@ from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.utils import np_utils
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import tensor_dataset_to_numpy
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores

# TODO(rbharath): Factor this out into a separate function in utils. Duplicates
# code in deep.py
# TODO(rbharath): paths is to handle sharded input pickle files. Might be
# better to use hdf5 datasets like in MSMBuilder
def process_3D_convolutions(paths, input_transforms, output_transforms, prediction_endpoint,
                            feature_types, seed=None, splittype="random"):
  """Loads 3D Convolution datasets and splits them into train/test.

  Parameters
  ----------
  paths: list
    List of paths to convolution datasets.
  input_transforms:
    Forwarded unchanged to load_and_transform_dataset.
  output_transforms:
    Forwarded unchanged to load_and_transform_dataset.
  prediction_endpoint:
    Forwarded unchanged to load_and_transform_dataset.
  feature_types:
    Forwarded unchanged to load_and_transform_dataset.
  seed: int
    Seed used for random splits.
  splittype: string
    Must be "random" or "scaffold"

  Returns
  -------
  The pair ((X_train, y_train, W_train, train), (X_test, y_test, W_test, test)),
  where the X/y/W entries come from tensor_dataset_to_numpy. Note that the
  dataset object is the LAST element of each 4-tuple.

  Raises
  ------
  ValueError
    If splittype is neither "random" nor "scaffold".
  """
  # Without this check an unknown splittype fell through both branches and
  # surfaced later as an UnboundLocalError on `train`. Same message as the
  # other split helpers in this code base.
  if splittype not in ("random", "scaffold"):
    raise ValueError("Improper splittype. Must be random/scaffold.")
  dataset = load_and_transform_dataset(paths, input_transforms, output_transforms,
    prediction_endpoint, feature_types=feature_types, datatype="pdbbind")
  # TODO(rbharath): Factor this code splitting out into a util function.
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  else:
    # Imported locally: train_test_scaffold_split is not among this module's
    # visible top-level imports (only train_test_random_split is).
    from deep_chem.utils.preprocess import train_test_scaffold_split
    train, test = train_test_scaffold_split(dataset)
  X_train, y_train, W_train = tensor_dataset_to_numpy(train)
  X_test, y_test, W_test = tensor_dataset_to_numpy(test)
  return (X_train, y_train, W_train, train), (X_test, y_test, W_test, test)

def fit_3D_convolution(paths, task_types, input_transforms, output_transforms, prediction_endpoint,
    feature_types, axis_length=32, **training_params):
def fit_3D_convolution(train_data, test_data, task_types, axis_length=32, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
  (X_train, y_train, W_train, train), (X_test, y_test, W_test, test) = (
      process_3D_convolutions(paths, input_transforms, output_transforms, prediction_endpoint,
                              feature_types))

  print "np.shape(X_train): " + str(np.shape(X_train))
  print "np.shape(y_train): " + str(np.shape(y_train))
      train_data, test_data)

  nb_classes = 2
  model = train_3D_convolution(X_train, y_train, axis_length, **training_params)
+13 −39
Original line number Diff line number Diff line
@@ -4,9 +4,7 @@ Code for processing datasets using scikit-learn.
import numpy as np
from deep_chem.utils.analysis import results_to_csv
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
@@ -23,8 +21,8 @@ from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.svm import SVR

def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    splittype="random", seed=None, num_to_train=None):
def fit_singletask_models(per_task_data, modeltype, task_types,
    num_to_train=None):
  """Fits singletask linear regression models to potency.

  Parameters
@@ -40,27 +38,18 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  task_transforms: dict 
  output_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  """
  dataset = load_and_transform_dataset(paths, task_transforms)
  singletask = multitask_to_singletask(dataset)
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(singletask.keys())
  sorted_targets = sorted(per_task_data.keys())
  if num_to_train:
    sorted_targets = sorted_targets[:num_to_train]
  for index, target in enumerate(sorted_targets):
    print "Building model %d" % index
    data = singletask[target]
    if splittype == "random":
      train, test = train_test_random_split(data, seed=seed)
    elif splittype == "scaffold":
      train, test = train_test_scaffold_split(data)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    X_train, y_train, W_train = dataset_to_numpy(train)
    X_test, y_test, W_test = dataset_to_numpy(test)
    (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        per_task_data[target])
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
@@ -102,29 +91,14 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    print results_to_csv(rms)
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))


def fit_multitask_rf(train_data, test_data, task_types):
  """Fits a multitask RF model to provided dataset.

  Trains a RandomForestClassifier on the training arrays, evaluates it on the
  test dataset with eval_model, and prints the mean ROC AUC over the scores
  returned by compute_roc_auc_scores.

  Parameters
  ----------
  train_data: tuple
    4-tuple (train, X_train, y_train, W_train). Only X_train and y_train are
    used for fitting.
  test_data: tuple
    4-tuple (test, X_test, y_test, W_test). Only the first element is used; it
    is handed to eval_model.
  task_types: dict
    dict mapping target names to output type. Forwarded to eval_model.
  """
  _, X_train, y_train, _ = train_data
  # The test tuple must not be unpacked into the X_train/y_train names: doing
  # so overwrote the training arrays and fit the model on the test split.
  test, _, _, _ = test_data
  model = RandomForestClassifier(n_estimators=100, n_jobs=-1,
      class_weight="auto")
  model.fit(X_train, y_train)
  results = eval_model(test, model, task_types)
  scores = compute_roc_auc_scores(results)
  # Single parenthesised argument: prints identically under the Python 2 print
  # statement and the print function.
  print("Mean AUC: %f" % np.mean(np.array(list(scores.values()))))
+28 −38
Original line number Diff line number Diff line
@@ -14,11 +14,6 @@ from deep_chem.utils.load import get_target_names
def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  #parser.add_argument("--datasets", nargs="+", required=1,
  #                    choices=["muv", "pcba", "dude", "pfizer", "globavir", "pdbbind"],
  #                    help="Name of dataset to process.")
  #parser.add_argument("--dataset-names", nargs="+", required=1,
  #                    help="Names of datasets to process.")
  parser.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
@@ -33,20 +28,23 @@ def parse_args(input_args=None):
                      help="Types of featurizations to use.")
  parser.add_argument("--paths", nargs="+", required=1,
                      help = "Paths to input datasets.")
  parser.add_argument("--mode", default="singletask",
                      choices=["singletask", "multitask"],
                      "Type of model being built.")
  parser.add_argument("--model", required=1,
                      choices=["logistic", "rf_classifier", "rf_regressor",
                      "linear", "ridge", "lasso", "lasso_lars", "elastic_net",
                      "singletask_deep_network", "multitask_deep_network",
                      "3D_cnn"])
  parser.add_argument("--splittype", type=str, default="scaffold",
                       choices=["scaffold", "random"],
                       help="Type of cross-validation data-splitting.")
                       choices=["scaffold", "random", "specified"],
                       help="Type of train/test data-splitting.\n"
                            "scaffold uses Bemis-Murcko scaffolds.\n"
                            "specified requires that split be in original data.")
  parser.add_argument("--prediction-endpoint", type=str, required=1,
                       help="Name of measured endpoint to predict.")
  # TODO(rbharath): There should be a way to directly compute n-input from the
  # provided feature-types?
  parser.add_argument("--n-inputs", type=int, default=1024,
                      help="Number of input features for models.")
  parser.add_argument("--split-endpoint", type=str, default=None,
                       help="Name of endpoint specifying train/test split.")
  parser.add_argument("--n-hidden", type=int, default=500,
                      help="Number of hidden neurons for NN models.")
  parser.add_argument("--learning-rate", type=float, default=0.01,
@@ -77,51 +75,43 @@ def main():

  paths = args.paths

  print paths
  targets = get_target_names(paths)
  task_types = {target: args.task_type for target in targets}
  input_transforms = args.input_transforms 
  print "input_transforms"
  print input_transforms
  output_transforms = {target: args.output_transforms for target in targets}

  # TODO(rbharath): Too many settings are explicitly passed down here. Is there
  # a good way to pass down configuration parameters into the invocations
  # below.
  datatype = "tensor" if args.model == "3D_cnn" else "vector"
  processed = process_datasets(paths,
      input_transforms, output_transforms, feature_types, 
      prediction_endpoint=prediction_endpoint,
      split_endpoint=split_endpoint,
      splittype=splittype, weight_positives=weight_positives,
      datatype=datatype)
  if args.mode == "multitask":
    train_data, test_data = processed
  else:
    per_task_data = processed
  # TODO(rbharath): Bundle training params into a training_param dict that's passed
  # down to these functions.
  if args.model == "singletask_deep_network":
    fit_singletask_mlp(paths, task_types, input_transforms, output_transforms,
      prediction_endpoint=args.prediction_endpoint,
      feature_types=args.feature_types,
      splittype=args.splittype, 
      n_inputs=args.n_inputs,
      n_hidden=args.n_hidden,
    fit_singletask_mlp(per_task_data, task_types, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      nb_epoch=args.n_epochs, decay=args.decay, batch_size=args.batch_size,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives, num_to_train=args.num_to_train)
  elif args.model == "multitask_deep_network":
    fit_multitask_mlp(paths, task_types, input_transforms, output_transforms,
      prediction_endpoint=args.prediction_endpoint,
      feature_types=args.feature_types,
      splittype=args.splittype,
      n_inputs=args.n_inputs,
      n_hidden=args.n_hidden, learning_rate =
      args.learning_rate, dropout = args.dropout, batch_size=args.batch_size,
    fit_multitask_mlp(train_data, test_data, task_types,
      n_hidden=args.n_hidden, learning_rate = args.learning_rate,
      dropout = args.dropout, batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives)
  elif args.model == "3D_cnn":
    fit_3D_convolution(paths, task_types, input_transforms, output_transforms,
        feature_types=args.feature_types,
        prediction_endpoint=args.prediction_endpoint,
    fit_3D_convolution(train_data, test_data, task_types,
        axis_length=args.axis_length, nb_epoch=args.n_epochs,
        batch_size=args.batch_size)
  else:
    print args.prediction_endpoint
    fit_singletask_models(paths, args.model, task_types,
        input_transforms, output_transforms, prediction_endpoint=args.prediction_endpoint,
        feature_types=args.feature_types, splittype=args.splittype,
        num_to_train=args.num_to_train)
    fit_singletask_models(per_task_data,  args.model, task_types, num_to_train=args.num_to_train)

if __name__ == "__main__":
  main()
+0 −1
Original line number Diff line number Diff line
@@ -215,6 +215,5 @@ def main():
  generate_fingerprints(args.name, args.out)
  generate_descriptors(args.name, args.out)


if __name__ == "__main__":
  main()
Loading