Commit 6d4f6df0 authored by Bharath Ramsundar

Merge pull request #32 from rbharath/multitask

Multitask Training with modeler
parents 63ada38a ee5cccc0
+53 −42
@@ -2,10 +2,8 @@
 Code for processing the Google vs-datasets using keras.
 """
 import numpy as np
-import keras
 from keras.models import Graph
-from keras.models import Sequential
-from keras.layers.core import Dense, Dropout, Activation
+from keras.layers.core import Dense, Dropout
 from keras.optimizers import SGD
 from deep_chem.utils.preprocess import to_one_hot
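For orientation, here is a minimal sketch of what the imported to_one_hot helper presumably does, inferred from its usage below (this is an assumption, not the actual deep_chem.utils.preprocess implementation): it expands a binary label vector into a two-column one-hot matrix that a softmax classification head can be trained against.

```python
import numpy as np

def to_one_hot(y):
  # Hypothetical sketch: expand a 1D binary label vector of shape
  # (n_samples,) into an (n_samples, 2) one-hot matrix for a
  # two-class softmax output head.
  y = np.asarray(y, dtype=int).ravel()
  one_hot = np.zeros((len(y), 2))
  one_hot[np.arange(len(y)), y] = 1
  return one_hot
```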

@@ -18,7 +16,7 @@ def fit_multitask_mlp(train_data, task_types, **training_params):
   Parameters
   ----------
   task_types: dict
-    dict mapping target names to output type. Each output type must be either
+    dict mapping task names to output type. Each output type must be either
     "classification" or "regression".
   training_params: dict
     Aggregates keyword parameters to pass to train_multitask_model
@@ -26,7 +24,8 @@ def fit_multitask_mlp(train_data, task_types, **training_params):
   models = {}
   # Follows convention from process_datasets that the data for multitask models
   # is grouped under key "all"
-  (_, X_train, y_train, W_train) = train_data["all"]
+  X_train = train_data["features"]
+  (y_train, W_train) = train_data["all"]
   models["all"] = train_multitask_model(X_train, y_train, W_train, task_types,
                                         **training_params)
   return models
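The refactor changes the train_data convention: the feature matrix is now stored once under "features", and each task key maps to a (y, W) pair. A hypothetical illustration of the layout this function now expects (field names are taken from the diff; shapes and dtypes are assumptions):

```python
import numpy as np

n_samples, n_features, n_tasks = 100, 1024, 5
train_data = {
    "mol_ids": ["mol_%d" % i for i in range(n_samples)],     # compound ids
    "features": np.random.rand(n_samples, n_features),       # shared X matrix
    "sorted_tasks": ["task_%d" % t for t in range(n_tasks)],
    # multitask labels and weights grouped under key "all"
    "all": (np.random.randint(2, size=(n_samples, n_tasks)),
            np.ones((n_samples, n_tasks))),
}
```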
@@ -36,26 +35,33 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
   Perform stochastic gradient descent optimization for a keras MLP.
 
   task_types: dict
-    dict mapping target names to output type. Each output type must be either
+    dict mapping task names to output type. Each output type must be either
     "classification" or "regression".
   output_transforms: dict
-    dict mapping target names to label transform. Each output type must be either
+    dict mapping task names to label transform. Each output type must be either
     None or "log". Only for regression outputs.
   training_params: dict
     Aggregates keyword parameters to pass to train_multitask_model
   """
   models = {}
-  for index, target in enumerate(sorted(train_data.keys())):
+  train_ids = train_data["mol_ids"]
+  X_train = train_data["features"]
+  sorted_tasks = train_data["sorted_tasks"]
+  for index, task in enumerate(sorted_tasks):
     print "Training model %d" % index
-    print "Target %s" % target
-    (train_ids, X_train, y_train, W_train) = train_data[target]
+    print "Target %s" % task
+    (y_train, W_train) = train_data[task]
+    flat_W_train = W_train.ravel()
+    task_X_train = X_train[flat_W_train.nonzero()]
+    task_y_train = y_train[flat_W_train.nonzero()]
     print "%d compounds in Train" % len(train_ids)
-    models[target] = train_multitask_model(X_train, y_train, W_train,
-        {target: task_types[target]}, **training_params)
+    models[task] = train_multitask_model(task_X_train, task_y_train, W_train,
+                                         {task: task_types[task]},
+                                         **training_params)
   return models
 
-def train_multitask_model(X, y, W, task_types,
-  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
+def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
+                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                           dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                           validation_split=0.1):
   """
@@ -71,7 +77,7 @@ def train_multitask_model(X, y, W, task_types,
   W: np.ndarray
     Weight matrix
   task_types: dict
-    dict mapping target names to output type. Each output type must be either
+    dict mapping task names to output type. Each output type must be either
     "classification" or "regression".
   learning_rate: float
     Learning rate used.
@@ -85,43 +91,48 @@ def train_multitask_model(X, y, W, task_types,
     maximal number of epochs to run the optimizer
   """
   eps = .001
-  num_tasks = len(task_types)
-  sorted_targets = sorted(task_types.keys())
+  sorted_tasks = sorted(task_types.keys())
   local_task_types = task_types.copy()
-  endpoints = sorted_targets
-  (_, n_inputs) = np.shape(X)
+  endpoints = sorted_tasks
+  print "train_multitask_model()"
+  print "np.shape(X)"
+  print np.shape(X)
+  n_inputs = len(X[0].flatten())
   # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
   W = W + eps * np.ones(np.shape(W))
+  print "np.shape(W)"
+  print np.shape(W)
   model = Graph()
-  model.add_input(name="input", ndim=n_inputs)
+  #model.add_input(name="input", ndim=n_inputs)
+  model.add_input(name="input", input_shape=(n_inputs,))
   model.add_node(
-      Dense(n_inputs, n_hidden, init='uniform', activation=activation),
+      Dense(n_hidden, init='uniform', activation=activation),
       name="dense", input="input")
   model.add_node(Dropout(dropout), name="dropout", input="dense")
   top_layer = "dropout"
-  for task, target in enumerate(endpoints):
-    task_type = local_task_types[target]
+  for ind, task in enumerate(endpoints):
+    task_type = local_task_types[task]
     if task_type == "classification":
       model.add_node(
-          Dense(n_hidden, 2, init='uniform', activation="softmax"),
-          name="dense_head%d" % task, input=top_layer)
+          Dense(2, init='uniform', activation="softmax"),
+          name="dense_head%d" % ind, input=top_layer)
     elif task_type == "regression":
       model.add_node(
-          Dense(n_hidden, 1, init='uniform'),
-          name="dense_head%d" % task, input=top_layer)
-    model.add_output(name="task%d" % task, input="dense_head%d" % task)
+          Dense(1, init='uniform'),
+          name="dense_head%d" % ind, input=top_layer)
+    model.add_output(name="task%d" % ind, input="dense_head%d" % ind)
   data_dict, loss_dict, sample_weights = {}, {}, {}
   data_dict["input"] = X
-  for task, target in enumerate(endpoints):
-    task_type = local_task_types[target]
-    taskname = "task%d" % task
-    sample_weights[taskname] = W[:, task]
+  for ind, task in enumerate(endpoints):
+    task_type = local_task_types[task]
+    taskname = "task%d" % ind
+    sample_weights[taskname] = W[:, ind]
     if task_type == "classification":
       loss_dict[taskname] = "binary_crossentropy"
-      data_dict[taskname] = to_one_hot(y[:,task])
+      data_dict[taskname] = to_one_hot(y[:, ind])
     elif task_type == "regression":
       loss_dict[taskname] = "mean_squared_error"
-      data_dict[taskname] = y[:,task]
+      data_dict[taskname] = y[:, ind]
   sgd = SGD(lr=learning_rate, decay=decay, momentum=momentum, nesterov=nesterov)
   print "About to compile model!"
   model.compile(optimizer=sgd, loss=loss_dict)
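A side note on one change above: n_inputs is now computed as len(X[0].flatten()) rather than unpacked from np.shape(X), which keeps the input-size computation working when each sample's features are multi-dimensional rather than a flat vector. For example:

```python
import numpy as np

X = np.random.rand(10, 4, 4)       # 10 samples with 4x4 feature grids
n_inputs = len(X[0].flatten())     # 16, the flattened per-sample size
# (_, n_inputs) = np.shape(X) would fail here with
# "too many values to unpack", since X is 3-dimensional.
```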
+21 −18
@@ -7,17 +7,22 @@ from keras.models import Sequential
 from keras.layers.core import Dense, Dropout, Activation, Flatten
 from keras.layers.convolutional import Convolution3D, MaxPooling3D
 
-def fit_3D_convolution(per_task_data, task_types, **training_params):
+def fit_3D_convolution(train_data, **training_params):
   """
   Perform stochastic gradient descent for a 3D CNN.
   """
   models = {}
-  (_, X_train, y_train, _), _ = per_task_data["all"]
-  nb_classes = 2
-  models["all"] = train_3D_convolution(X_train, y_train, **training_params)
+  X_train = train_data["features"]
+  if len(train_data["sorted_tasks"]) > 1:
+    raise ValueError("3D Convolutions only supported for singletask.")
+  task_name = train_data["sorted_tasks"][0]
+  (y_train, _) = train_data[task_name]
+  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
   return models
 
-def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
+def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,
+                         loss_function="mean_squared_error"):
   """
   Fit a keras 3D CNN to data.

@@ -33,8 +38,6 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
   (n_samples, axis_length, _, _, n_channels) = np.shape(X)
   X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
   print "Final shape of X: " + str(np.shape(X))
-  # Number of classes for classification
-  nb_classes = 2
 
   # number of convolutional filters to use at each layer
   nb_filters = [axis_length/2, axis_length, axis_length]
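One caveat on the channel reordering above: np.reshape reinterprets the buffer without moving any data, so recasting (n_samples, L, L, L, channels) as (n_samples, L, channels, L, L) scrambles which values belong to which voxel. If the intent is to relocate the channel axis, np.transpose is the usual tool; a quick sketch of the difference:

```python
import numpy as np

X = np.random.rand(2, 8, 8, 8, 3)            # (samples, x, y, z, channels)
X_moved = np.transpose(X, (0, 1, 4, 2, 3))   # channel axis actually relocated
X_reshaped = np.reshape(X, (2, 8, 3, 8, 8))  # same buffer, just reinterpreted
assert X_moved.shape == X_reshaped.shape
# The two arrays hold the same values in different positions, so for
# random data they will (almost surely) not match element-wise.
assert not np.allclose(X_moved, X_reshaped)
```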
@@ -47,8 +50,8 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
 
   model = Sequential()
   model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,
-     nb_row=nb_conv[0], nb_col=nb_conv[0], nb_depth=nb_conv[0],
-     border_mode='valid'))
+                          nb_row=nb_conv[0], nb_col=nb_conv[0],
+                          nb_depth=nb_conv[0], border_mode='valid'))
   model.add(Activation('relu'))
   model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))
   model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
@@ -57,22 +60,22 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
   model.add(Activation('relu'))
   model.add(MaxPooling3D(poolsize=(nb_pool[1], nb_pool[1], nb_pool[1])))
   model.add(Convolution3D(nb_filter=nb_filters[2], stack_size=nb_filters[1],
-     nb_row=nb_conv[2], nb_col=nb_conv[2], nb_depth=nb_conv[2],
-     border_mode='valid'))
+                          nb_row=nb_conv[2], nb_col=nb_conv[2],
+                          nb_depth=nb_conv[2], border_mode='valid'))
   model.add(Activation('relu'))
   model.add(MaxPooling3D(poolsize=(nb_pool[2], nb_pool[2], nb_pool[2])))
   model.add(Flatten())
   # TODO(rbharath): If we change away from axis-size 32, this code will break.
   # Eventually figure out a more general rule that works for all axis sizes.
-  model.add(Dense(32, 32/2, init='normal'))
+  model.add(Dense(32/2, init='normal'))
   model.add(Activation('relu'))
   model.add(Dropout(0.5))
   # TODO(rbharath): Generalize this to support classification as well as regression.
-  model.add(Dense(32/2, 1, init='normal'))
+  model.add(Dense(1, init='normal'))
 
-  sgd = RMSprop(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
+  sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
   print "About to compile model"
-  model.compile(loss='mean_squared_error', optimizer=sgd)
+  model.compile(loss=loss_function, optimizer=sgd)
   print "About to fit data to model."
   model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)
   return model
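A hypothetical end-to-end call into this trainer on toy voxel data (the shapes follow the unpacking at the top of train_3D_convolution, and axis size 32 matches the hard-coded Dense layer flagged in the TODO; the real featurization lives elsewhere in deep_chem):

```python
import numpy as np

# 10 samples on a 32x32x32 voxel grid with a single channel, in the
# (n_samples, axis_length, axis_length, axis_length, n_channels) layout
# that train_3D_convolution expects before its internal reshape.
X = np.random.rand(10, 32, 32, 32, 1)
y = np.random.rand(10, 1)
model = train_3D_convolution(X, y, batch_size=5, nb_epoch=1)
```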
+12 −17
@@ -24,13 +24,19 @@ def fit_singletask_models(train_data, modeltype):
   seed: int (optional)
     Seed to initialize np.random.
   output_transforms: dict
-    dict mapping target names to label transform. Each output type must be either
+    dict mapping task names to label transform. Each output type must be either
     None or "log". Only for regression outputs.
   """
   models = {}
-  for target in sorted(train_data.keys()):
-    print "Building model for target %s" % target
-    (_, X_train, y_train, _) = train_data[target]
+  import numpy as np
+  X_train = train_data["features"]
+  sorted_tasks = train_data["sorted_tasks"]
+  for task in sorted_tasks:
+    print "Building model for task %s" % task
+    (y_train, W_train) = train_data[task]
+    W_train = W_train.ravel()
+    task_X_train = X_train[W_train.nonzero()]
+    task_y_train = y_train[W_train.nonzero()]
     if modeltype == "rf_regressor":
       model = RandomForestRegressor(
           n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
@@ -51,17 +57,6 @@ def fit_singletask_models(train_data, modeltype):
       model = ElasticNetCV(max_iter=2000, n_jobs=-1)
     else:
       raise ValueError("Invalid model type provided.")
-    model.fit(X_train, y_train.ravel())
-    models[target] = model
+    model.fit(task_X_train, task_y_train.ravel())
+    models[task] = model
   return models
-
-# TODO(rbharath): I believe this is broken. Update it to work with the rest of
-# the package.
-def fit_multitask_rf(train_data):
-  """Fits a multitask RF model to provided dataset.
-  """
-  (_, X_train, y_train, _) = train_data
-  model = RandomForestClassifier(
-      n_estimators=100, n_jobs=-1, class_weight="auto")
-  model.fit(X_train, y_train)
-  return model
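A hypothetical invocation of fit_singletask_models, reusing the train_data layout sketched earlier; the "rf_regressor" modeltype string is one of the branches visible in this diff:

```python
import numpy as np

n_samples, n_features = 50, 128
train_data = {
    "features": np.random.rand(n_samples, n_features),
    "sorted_tasks": ["task_0"],
    "task_0": (np.random.rand(n_samples, 1), np.ones((n_samples, 1))),
}
models = fit_singletask_models(train_data, "rf_regressor")
```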
+94 −88
(File changed; diff collapsed: preview size limit exceeded.)

deep_chem/utils/analysis.py
deleted 100644 → 0
+0 −128
"""
Utility functions to compare datasets to one another.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import numpy as np

def results_to_csv(results_dict):
  """Pretty prints results as CSV line."""
  targets = sorted(results_dict.keys())
  print ",".join(targets)
  print ",".join([str(results_dict[target]) for target in targets])

def summarize_distribution(y):
  """Analyzes regression dataset.

  Parameters
  ----------
  y: np.ndarray 
    A 1D numpy array containing distribution.
  """
  mean = np.mean(y)
  std = np.std(y)
  minval = np.amin(y)
  maxval = np.amax(y)
  hist = np.histogram(y)
  print "Mean: %f" % mean
  print "Std: %f" % std
  print "Min: %f" % minval
  print "Max: %f" % maxval
  print "Histogram: "
  print hist
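For the record (this module is removed by this commit), a usage sketch of the helper above on synthetic data:

```python
import numpy as np

y = np.random.normal(loc=0.0, scale=1.0, size=1000)
summarize_distribution(y)
# Prints the sample's mean/std/min/max and a default 10-bin np.histogram.
```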

def analyze_data(dataset, splittype="random"):
  """Analyzes regression dataset.

  Parameters
  ----------
  dataset: dict
    A dictionary of type produced by load_datasets.
  splittype: string
    Type of split for train/test. Either random or scaffold.
  """
  singletask = multitask_to_singletask(dataset)
  for target in singletask:
    data = singletask[target]
    if len(data.keys()) == 0:
      continue
    if splittype == "random":
      train, test = train_test_random_split(data, seed=0)
    elif splittype == "scaffold":
      train, test = train_test_scaffold_split(data)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    _, Xtrain, ytrain, _ = dataset_to_numpy(train)
    # TODO(rbharath): Take this out once debugging is completed
    ytrain = np.log(ytrain)
    mean = np.mean(ytrain)
    std = np.std(ytrain)
    minval = np.amin(ytrain)
    maxval = np.amax(ytrain)
    hist = np.histogram(ytrain)
    print target
    print "Mean: %f" % mean
    print "Std: %f" % std
    print "Min: %f" % minval
    print "Max: %f" % maxval
    print "Histogram: "
    print hist


def compare_all_datasets():
  """Compare all datasets in our collection.

  TODO(rbharath): Make this actually robust.
  """
  muv_path = "/home/rbharath/vs-datasets/muv"
  pcba_path = "/home/rbharath/vs-datasets/pcba"
  dude_path = "/home/rbharath/vs-datasets/dude"
  pfizer_path = "/home/rbharath/private-datasets/pfizer"
  muv_data = load_datasets([muv_path])
  pcba_data = load_datasets([pcba_path])
  dude_data = load_datasets([dude_path])
  pfizer_data = load_datasets([pfizer_path])
  print "----------------------"
  compare_datasets("muv", muv_data, "pcba", pcba_data)
  print "----------------------"
  compare_datasets("pfizer", pfizer_data, "muv", muv_data)
  print "----------------------"
  compare_datasets("pfizer", pfizer_data, "pcba", pcba_data)
  print "----------------------"
  compare_datasets("muv", muv_data, "dude", dude_data)
  print "----------------------"
  compare_datasets("pcba", pcba_data, "dude", dude_data)
  print "----------------------"
  compare_datasets("pfizer", pfizer_data, "dude", dude_data)

def compare_datasets(first_name, first, second_name, second):
  """Counts the overlap between two provided datasets.

  Parameters
  ----------
  first_name: string
    Name of first dataset
  first: dict
    Data dictionary generated by load_datasets.
  second_name: string
    Name of second dataset
  second: dict
    Data dictionary generated by load_datasets.
  """
  first_scaffolds = set()
  for key in first:
    _, scaffold, _ = first[key]
    first_scaffolds.add(scaffold)
  print "%d molecules in %s dataset" % (len(first), first_name)
  print "%d scaffolds in %s dataset" % (len(first_scaffolds), first_name)
  second_scaffolds = set()
  for key in second:
    _, scaffold, _ = second[key]
    second_scaffolds.add(scaffold)
  print "%d molecules in %s dataset" % (len(second), second_name)
  print "%d scaffolds in %s dataset" % (len(second_scaffolds), second_name)
  common_scaffolds = first_scaffolds.intersection(second_scaffolds)
  print "%d scaffolds in both" % len(common_scaffolds)