Commit 6d4f6df0 authored by Bharath Ramsundar

Merge pull request #32 from rbharath/multitask

Multitask Training with modeler
parents 63ada38a ee5cccc0
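This pull request moves the fitting routines off per-target (ids, X, y, W) tuples and onto a shared dictionary layout: one "features" matrix for all compounds, a "sorted_tasks" list, and per-task (y, W) label/weight pairs, with multitask labels additionally grouped under the key "all". A minimal sketch of that layout, with hypothetical task names and toy shapes (only the dictionary keys and function names come from the diff below):

import numpy as np

n_compounds, n_features = 100, 1024
train_data = {
    "mol_ids": np.arange(n_compounds),                    # compound identifiers
    "features": np.random.rand(n_compounds, n_features),  # shared descriptor matrix
    "sorted_tasks": ["task_a", "task_b"],                 # hypothetical task names
    # Per-task labels y and weights W; W == 0 marks a missing measurement.
    "task_a": (np.random.randint(2, size=(n_compounds, 1)).astype(float),
               np.random.randint(2, size=(n_compounds, 1)).astype(float)),
    "task_b": (np.random.rand(n_compounds, 1),
               np.ones((n_compounds, 1))),
}
task_types = {"task_a": "classification", "task_b": "regression"}
# fit_singletask_mlp(train_data, task_types) iterates over "sorted_tasks";
# fit_multitask_mlp instead reads the stacked labels from train_data["all"].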
+53 −42
@@ -2,10 +2,8 @@
 Code for processing the Google vs-datasets using keras.
 """
 import numpy as np
-import keras
 from keras.models import Graph
-from keras.models import Sequential
-from keras.layers.core import Dense, Dropout
+from keras.layers.core import Dense, Dropout, Activation
 from keras.optimizers import SGD
 from deep_chem.utils.preprocess import to_one_hot
 
@@ -18,7 +16,7 @@ def fit_multitask_mlp(train_data, task_types, **training_params):
   Parameters
   ----------
   task_types: dict
-    dict mapping target names to output type. Each output type must be either
+    dict mapping task names to output type. Each output type must be either
     "classification" or "regression".
   training_params: dict
     Aggregates keyword parameters to pass to train_multitask_model
@@ -26,7 +24,8 @@ def fit_multitask_mlp(train_data, task_types, **training_params):
   models = {}
   # Follows convention from process_datasets that the data for multitask models
   # is grouped under key "all"
-  (_, X_train, y_train, W_train) = train_data["all"]
+  X_train = train_data["features"]
+  (y_train, W_train) = train_data["all"]
   models["all"] = train_multitask_model(X_train, y_train, W_train, task_types,
                                         **training_params)
   return models
@@ -36,26 +35,35 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
   Perform stochastic gradient descent optimization for a keras MLP.
 
   task_types: dict
-    dict mapping target names to output type. Each output type must be either
+    dict mapping task names to output type. Each output type must be either
     "classification" or "regression".
   output_transforms: dict
-    dict mapping target names to label transform. Each output type must be either
+    dict mapping task names to label transform. Each output type must be either
     None or "log". Only for regression outputs.
   training_params: dict
     Aggregates keyword parameters to pass to train_multitask_model
   """
   models = {}
-  for index, target in enumerate(sorted(train_data.keys())):
+  train_ids = train_data["mol_ids"]
+  X_train = train_data["features"]
+  sorted_tasks = train_data["sorted_tasks"]
+  for index, task in enumerate(sorted_tasks):
     print "Training model %d" % index
-    print "Target %s" % target
-    (train_ids, X_train, y_train, W_train) = train_data[target]
+    print "Target %s" % task
+    (y_train, W_train) = train_data[task]
+    # Drop compounds with zero weight (i.e., no measurement for this task).
+    flat_W_train = W_train.ravel()
+    task_X_train = X_train[flat_W_train.nonzero()]
+    task_y_train = y_train[flat_W_train.nonzero()]
+    task_W_train = W_train[flat_W_train.nonzero()]
     print "%d compounds in Train" % len(train_ids)
-    models[target] = train_multitask_model(X_train, y_train, W_train,
-        {target: task_types[target]}, **training_params)
+    models[task] = train_multitask_model(task_X_train, task_y_train,
+                                         task_W_train, {task: task_types[task]},
+                                         **training_params)
   return models
 
-def train_multitask_model(X, y, W, task_types,
-  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
+def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
+                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                           dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                           validation_split=0.1):
   """
@@ -71,7 +77,7 @@ def train_multitask_model(X, y, W, task_types,
   W: np.ndarray
     Weight matrix
   task_types: dict
-    dict mapping target names to output type. Each output type must be either
+    dict mapping task names to output type. Each output type must be either
     "classification" or "regression".
   learning_rate: float
     Learning rate used.
@@ -85,43 +91,48 @@ def train_multitask_model(X, y, W, task_types,
     maximal number of epochs to run the optimizer
   """
   eps = .001
-  num_tasks = len(task_types)
-  sorted_targets = sorted(task_types.keys())
+  sorted_tasks = sorted(task_types.keys())
   local_task_types = task_types.copy()
-  endpoints = sorted_targets
-  (_, n_inputs) = np.shape(X)
+  endpoints = sorted_tasks
+  print "train_multitask_model()"
+  print "np.shape(X)"
+  print np.shape(X)
+  n_inputs = len(X[0].flatten())
   # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
   W = W + eps * np.ones(np.shape(W))
+  print "np.shape(W)"
+  print np.shape(W)
   model = Graph()
-  model.add_input(name="input", ndim=n_inputs)
+  #model.add_input(name="input", ndim=n_inputs)
+  model.add_input(name="input", input_shape=(n_inputs,))
   model.add_node(
-      Dense(n_inputs, n_hidden, init='uniform', activation=activation),
+      Dense(n_hidden, init='uniform', activation=activation),
       name="dense", input="input")
   model.add_node(Dropout(dropout), name="dropout", input="dense")
   top_layer = "dropout"
-  for task, target in enumerate(endpoints):
-    task_type = local_task_types[target]
+  for ind, task in enumerate(endpoints):
+    task_type = local_task_types[task]
     if task_type == "classification":
       model.add_node(
-          Dense(n_hidden, 2, init='uniform', activation="softmax"),
-          name="dense_head%d" % task, input=top_layer)
+          Dense(2, init='uniform', activation="softmax"),
+          name="dense_head%d" % ind, input=top_layer)
     elif task_type == "regression":
       model.add_node(
-          Dense(n_hidden, 1, init='uniform'),
-          name="dense_head%d" % task, input=top_layer)
-    model.add_output(name="task%d" % task, input="dense_head%d" % task)
+          Dense(1, init='uniform'),
+          name="dense_head%d" % ind, input=top_layer)
+    model.add_output(name="task%d" % ind, input="dense_head%d" % ind)
   data_dict, loss_dict, sample_weights = {}, {}, {}
   data_dict["input"] = X
-  for task, target in enumerate(endpoints):
-    task_type = local_task_types[target]
-    taskname = "task%d" % task
-    sample_weights[taskname] = W[:, task]
+  for ind, task in enumerate(endpoints):
+    task_type = local_task_types[task]
+    taskname = "task%d" % ind
+    sample_weights[taskname] = W[:, ind]
     if task_type == "classification":
       loss_dict[taskname] = "binary_crossentropy"
-      data_dict[taskname] = to_one_hot(y[:,task])
+      data_dict[taskname] = to_one_hot(y[:, ind])
     elif task_type == "regression":
       loss_dict[taskname] = "mean_squared_error"
-      data_dict[taskname] = y[:,task]
+      data_dict[taskname] = y[:, ind]
   sgd = SGD(lr=learning_rate, decay=decay, momentum=momentum, nesterov=nesterov)
   print "About to compile model!"
   model.compile(optimizer=sgd, loss=loss_dict)
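For reference, the weight-mask filtering that fit_singletask_mlp now performs behaves like the following sketch (toy arrays, purely illustrative):

import numpy as np

X = np.arange(12).reshape(4, 3)         # 4 compounds, 3 features
y = np.array([[1.], [0.], [1.], [0.]])  # labels for one task
W = np.array([[1.], [0.], [1.], [1.]])  # zero weight = unlabeled compound

flat_W = W.ravel()
keep = flat_W.nonzero()                 # indices of labeled compounds
task_X = X[keep]                        # shape (3, 3): unlabeled row dropped
task_y = y[keep]                        # shape (3, 1)
task_W = flat_W[keep]                   # weights aligned with the kept rows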
+21 −18
@@ -7,17 +7,22 @@ from keras.models import Sequential
 from keras.layers.core import Dense, Dropout, Activation, Flatten
 from keras.layers.convolutional import Convolution3D, MaxPooling3D
 
-def fit_3D_convolution(per_task_data, task_types, **training_params):
+def fit_3D_convolution(train_data, **training_params):
   """
   Perform stochastic gradient descent for a 3D CNN.
   """
   models = {}
-  (_, X_train, y_train, _), _ = per_task_data["all"]
-  nb_classes = 2
-  models["all"] = train_3D_convolution(X_train, y_train, **training_params)
+  X_train = train_data["features"]
+  if len(train_data["sorted_tasks"]) > 1:
+    raise ValueError("3D Convolutions only supported for singletask.")
+  task_name = train_data["sorted_tasks"][0]
+  (y_train, _) = train_data[task_name]
+  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
   return models
 
-def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
+def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,
+                         loss_function="mean_squared_error"):
+
   """
   Fit a keras 3D CNN to data.
 
@@ -33,8 +38,6 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
   (n_samples, axis_length, _, _, n_channels) = np.shape(X)
   X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
   print "Final shape of X: " + str(np.shape(X))
-  # Number of classes for classification
-  nb_classes = 2
 
   # number of convolutional filters to use at each layer
   nb_filters = [axis_length/2, axis_length, axis_length]
@@ -47,8 +50,8 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
 
   model = Sequential()
   model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,
-     nb_row=nb_conv[0], nb_col=nb_conv[0], nb_depth=nb_conv[0],
-     border_mode='valid'))
+                          nb_row=nb_conv[0], nb_col=nb_conv[0],
+                          nb_depth=nb_conv[0], border_mode='valid'))
   model.add(Activation('relu'))
   model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))
   model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
@@ -57,22 +60,22 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
   model.add(Activation('relu'))
   model.add(MaxPooling3D(poolsize=(nb_pool[1], nb_pool[1], nb_pool[1])))
   model.add(Convolution3D(nb_filter=nb_filters[2], stack_size=nb_filters[1],
-     nb_row=nb_conv[2], nb_col=nb_conv[2], nb_depth=nb_conv[2],
-     border_mode='valid'))
+                          nb_row=nb_conv[2], nb_col=nb_conv[2],
+                          nb_depth=nb_conv[2], border_mode='valid'))
   model.add(Activation('relu'))
   model.add(MaxPooling3D(poolsize=(nb_pool[2], nb_pool[2], nb_pool[2])))
   model.add(Flatten())
   # TODO(rbharath): If we change away from axis-size 32, this code will break.
   # Eventually figure out a more general rule that works for all axis sizes.
-  model.add(Dense(32, 32/2, init='normal'))
+  model.add(Dense(32/2, init='normal'))
   model.add(Activation('relu'))
   model.add(Dropout(0.5))
   # TODO(rbharath): Generalize this to support classification as well as regression.
-  model.add(Dense(32/2, 1, init='normal'))
+  model.add(Dense(1, init='normal'))
 
-  sgd = RMSprop(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
+  sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
   print "About to compile model"
-  model.compile(loss='mean_squared_error', optimizer=sgd)
+  model.compile(loss=loss_function, optimizer=sgd)
   print "About to fit data to model."
   model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)
   return model
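A hedged usage sketch for the revised train_3D_convolution (toy sizes; the axis length of 32 matches the hard-coded Dense(32/2) layer flagged in the TODO above):

import numpy as np

# train_3D_convolution expects grids of shape
# (n_samples, axis_length, axis_length, axis_length, n_channels)
# and performs its own reshape before building the network.
n_samples, axis_length, n_channels = 8, 32, 4
X = np.zeros((n_samples, axis_length, axis_length, axis_length, n_channels))
y = np.zeros((n_samples, 1))  # regression targets; loss defaults to MSE
# model = train_3D_convolution(X, y, batch_size=4, nb_epoch=1)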
+12 −17
@@ -24,13 +24,19 @@ def fit_singletask_models(train_data, modeltype):
   seed: int (optional)
     Seed to initialize np.random.
   output_transforms: dict
-    dict mapping target names to label transform. Each output type must be either
+    dict mapping task names to label transform. Each output type must be either
     None or "log". Only for regression outputs.
   """
   models = {}
-  for target in sorted(train_data.keys()):
-    print "Building model for target %s" % target
-    (_, X_train, y_train, _) = train_data[target]
+  import numpy as np
+  X_train = train_data["features"]
+  sorted_tasks = train_data["sorted_tasks"]
+  for task in sorted_tasks:
+    print "Building model for task %s" % task
+    (y_train, W_train) = train_data[task]
+    W_train = W_train.ravel()
+    task_X_train = X_train[W_train.nonzero()]
+    task_y_train = y_train[W_train.nonzero()]
     if modeltype == "rf_regressor":
       model = RandomForestRegressor(
           n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
@@ -51,17 +57,6 @@ def fit_singletask_models(train_data, modeltype):
       model = ElasticNetCV(max_iter=2000, n_jobs=-1)
     else:
       raise ValueError("Invalid model type provided.")
-    model.fit(X_train, y_train.ravel())
-    models[target] = model
+    model.fit(task_X_train, task_y_train.ravel())
+    models[task] = model
   return models
-
-# TODO(rbharath): I believe this is broken. Update it to work with the rest of
-# the package.
-def fit_multitask_rf(train_data):
-  """Fits a multitask RF model to provided dataset.
-  """
-  (_, X_train, y_train, _) = train_data
-  model = RandomForestClassifier(
-      n_estimators=100, n_jobs=-1, class_weight="auto")
-  model.fit(X_train, y_train)
-  return model
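The scikit-learn path applies the same mask-then-fit pattern per task; a sketch of the rf_regressor branch with toy data (values illustrative):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

X_train = np.random.rand(20, 5)                         # shared features
y_train = np.random.rand(20, 1)                         # labels for one task
W_train = (np.random.rand(20, 1) > 0.3).astype(float)   # ~70% of compounds labeled

keep = W_train.ravel().nonzero()                        # rows with measurements
model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
                              warm_start=True, max_features="sqrt")
model.fit(X_train[keep], y_train[keep].ravel())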
+94 −88
File changed. (Diff collapsed: preview size limit exceeded.)

deep_chem/utils/analysis.py

deleted 100644 → 0
+0 −128
"""
Utility functions to compare datasets to one another.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import numpy as np

def results_to_csv(results_dict):
  """Pretty prints results as CSV line."""
  targets = sorted(results_dict.keys())
  print ",".join(targets)
  print ",".join([str(results_dict[target]) for target in targets])

def summarize_distribution(y):
  """Analyzes regression dataset.

  Parameters
  ----------
  y: np.ndarray 
    A 1D numpy array containing distribution.
  """
  mean = np.mean(y)
  std = np.std(y)
  minval = np.amin(y)
  maxval = np.amax(y)
  hist = np.histogram(y)
  print "Mean: %f" % mean
  print "Std: %f" % std
  print "Min: %f" % minval
  print "Max: %f" % maxval
  print "Histogram: "
  print hist

def analyze_data(dataset, splittype="random"):
  """Analyzes regression dataset.

  Parameters
  ----------
  dataset: dict
    A dictionary of type produced by load_datasets.
  splittype: string
    Type of split for train/test. Either random or scaffold.
  """
  singletask = multitask_to_singletask(dataset)
  for target in singletask:
    data = singletask[target]
    if len(data.keys()) == 0:
      continue
    if splittype == "random":
      train, test = train_test_random_split(data, seed=0)
    elif splittype == "scaffold":
      train, test = train_test_scaffold_split(data)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    _, Xtrain, ytrain, _ = dataset_to_numpy(train)
    # TODO(rbharath): Take this out once debugging is completed
    ytrain = np.log(ytrain)
    mean = np.mean(ytrain)
    std = np.std(ytrain)
    minval = np.amin(ytrain)
    maxval = np.amax(ytrain)
    hist = np.histogram(ytrain)
    print target
    print "Mean: %f" % mean
    print "Std: %f" % std
    print "Min: %f" % minval
    print "Max: %f" % maxval
    print "Histogram: "
    print hist


def compare_all_datasets():
  """Compare all datasets in our collection.

  TODO(rbharath): Make this actually robust.
  """
  muv_path = "/home/rbharath/vs-datasets/muv"
  pcba_path = "/home/rbharath/vs-datasets/pcba"
  dude_path = "/home/rbharath/vs-datasets/dude"
  pfizer_path = "/home/rbharath/private-datasets/pfizer"
  muv_data = load_datasets([muv_path])
  pcba_data = load_datasets([pcba_path])
  dude_data = load_datasets([dude_path])
  pfizer_data = load_datasets([pfizer_path])
  print "----------------------"
  compare_datasets("muv", muv_data, "pcba", pcba_data)
  print "----------------------"
  compare_datasets("pfizer", pfizer_data, "muv", muv_data)
  print "----------------------"
  compare_datasets("pfizer", pfizer_data, "pcba", pcba_data)
  print "----------------------"
  compare_datasets("muv", muv_data, "dude", dude_data)
  print "----------------------"
  compare_datasets("pcba", pcba_data, "dude", dude_data)
  print "----------------------"
  compare_datasets("pfizer", pfizer_data, "dude", dude_data)

def compare_datasets(first_name, first, second_name, second):
  """Counts the overlap between two provided datasets.

  Parameters
  ----------
  first_name: string
    Name of first dataset
  first: dict
    Data dictionary generated by load_datasets.
  second_name: string
    Name of second dataset
  second: dict
    Data dictionary generated by load_datasets.
  """
  first_scaffolds = set()
  for key in first:
    _, scaffold, _ = first[key]
    first_scaffolds.add(scaffold)
  print "%d molecules in %s dataset" % (len(first), first_name)
  print "%d scaffolds in %s dataset" % (len(first_scaffolds), first_name)
  second_scaffolds = set()
  for key in second:
    _, scaffold, _ = second[key]
    second_scaffolds.add(scaffold)
  print "%d molecules in %s dataset" % (len(second), second_name)
  print "%d scaffolds in %s dataset" % (len(second_scaffolds), second_name)
  common_scaffolds = first_scaffolds.intersection(second_scaffolds)
  print "%d scaffolds in both" % len(common_scaffolds)