Commit 415033d0 authored by Bharath Ramsundar

Merge pull request #35 from evanfeinberg/master

Working with Data Sets exceeding RAM 
parents 6d4f6df0 34925328
+101 −0
"""
Contains an abstract base class that supports different ML models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
  """
  Abstract base class for different ML models.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")    

  def set_raw_model(self, raw_model):
    """
    Set underlying raw model. Useful when loading from disk.
    """
    self.raw_model = raw_model

  def get_raw_model(self):
    """
    Return raw model.
    """
    return self.raw_model

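# --- Illustrative sketch (editor's addition, not part of this commit) ---
# A minimal concrete subclass showing how the batch interface above is meant to
# be used: fit_on_batch() is called repeatedly with chunks that individually
# fit in memory, and predict_on_batch() scores one chunk at a time. The
# least-squares-by-SGD "model" below is hypothetical and exists only to make
# the example self-contained.
import numpy as np

class LinearBatchModel(Model):
  """Toy linear regressor updated with one gradient step per batch."""
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(LinearBatchModel, self).__init__(task_types, model_params)
    self.coef = None
    self.learning_rate = model_params.get("learning_rate", 0.01)

  def fit_on_batch(self, X, y, w):
    if self.coef is None:
      self.coef = np.zeros(X.shape[1])
    residual = X.dot(self.coef) - np.ravel(y)
    # w holds per-example weights; use them to scale the squared-error gradient.
    grad = X.T.dot(np.ravel(w) * residual) / len(residual)
    self.coef -= self.learning_rate * grad

  def predict_on_batch(self, X):
    return X.dot(self.coef)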

'''
def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns an ndarray of shape (n_samples, n_targets)

  TODO(rbharath): This function uses n_targets instead of
  task_transforms like everything else.

  Parameters
  ----------
  X: numpy.ndarray
    Test set data.
  model: model.
    A trained scikit-learn or keras model.
  n_targets: int
    Number of output targets
  task_types: dict
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  modeltype: string
    Either sklearn, keras, or keras_multitask
  """
  # Extract features for test set and make preds
  # TODO(rbharath): This change in shape should not(!) be handled here. Make
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_targets):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      print("model_predictions()")
      print("np.shape(X)")
      print(np.shape(X))
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
  elif modeltype == "keras-sequential":
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds
'''
+67 −3
@@ -5,8 +5,72 @@ import numpy as np
from keras.models import Graph
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.models import Model

#TODO(rbharath/enf): Make this real. It's a dummy now.
class SingleTaskDNN(Model):
  """
  Single-task deep network model. Currently a stub.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")    

#TODO(rbharath/enf): Make this real. It's a dummy now.
class MultiTaskDNN(Model):
  """
  Multitask deep network model. Currently a stub.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")   

def to_one_hot(y):
  """Transforms label vector into one-hot encoding.

  Turns y into an array of shape [n_samples, 2] (assuming binary labels).

  y: np.ndarray
    A vector of shape [n_samples, 1]
  """
  n_samples = np.shape(y)[0]
  y_hot = np.zeros((n_samples, 2))
  for index, val in enumerate(y):
    if val == 0:
      y_hot[index] = np.array([1, 0])
    elif val == 1:
      y_hot[index] = np.array([0, 1])
  return y_hot

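# Illustrative usage (editor's addition, not part of this commit):
#   >>> to_one_hot(np.array([0, 1, 1]))
#   array([[ 1.,  0.],
#          [ 0.,  1.],
#          [ 0.,  1.]])
# A vectorized equivalent, assuming y contains only 0/1 integer labels, is
#   np.eye(2)[np.asarray(y, dtype=int).ravel()]
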
def fit_multitask_mlp(train_data, task_types, **training_params):
  """
@@ -62,7 +126,7 @@ def fit_singletask_mlp(train_data, task_types, **training_params):

def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          dropout=0.5, nb_epoch=20, batch_size=50, nb_hidden=500,
                          validation_split=0.1):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
@@ -106,7 +170,7 @@ def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
  #model.add_input(name="input", ndim=n_inputs)
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_node(
      Dense(nb_hidden, init='uniform', activation=activation),
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
+72 −62
"""
Code for training 3D convolutions.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from deep_chem.models import Model

def shuffle_shape(shape):
  (axis_length, _, _, n_channels) = shape
  shuffled_shape = (n_channels, axis_length, axis_length, axis_length)
  return shuffled_shape

def shuffle_data(X):
  (n_samples, axis_length, _, _, n_channels) = np.shape(X)
  X = np.reshape(X, (n_samples, n_channels, axis_length, axis_length, axis_length))
  return X
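
# Editor's note (not part of this commit): np.reshape only reinterprets the
# underlying buffer; it changes the reported shape but does not move the
# channel axis relative to the voxel data. If the intent is to permute axes
# from (n_samples, N, N, N, n_channels) to (n_samples, n_channels, N, N, N)
# while keeping each voxel's channels aligned, np.transpose would do that:
#   X = np.transpose(X, (0, 4, 1, 2, 3))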

class DockingDNN(Model):
  """
  Wrapper class for fitting 3D convolutional networks for deep docking.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)
    if initialize_raw_model:
      (axis_length, _, _, n_channels) = model_params["data_shape"]
      self.input_shape = (n_channels, 
                          axis_length, axis_length, axis_length)

      learning_rate = model_params["learning_rate"]
      loss_function = model_params["loss_function"]

      # number of convolutional filters to use at each layer
      # (integer division so nb_filter stays an int under future division)
      nb_filters = [axis_length // 2, axis_length, axis_length]
@@ -47,35 +46,46 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,

      # level of convolution to perform at each layer (CONV x CONV)
      nb_conv = [7, 5, 3]

      model = Sequential()

      model.add(Convolution3D(nb_filter=nb_filters[0], nb_depth=nb_conv[0],
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
                              input_shape=self.input_shape, border_mode="full"))
      model.add(Activation('relu'))

      model.add(MaxPooling3D(pool_size=(nb_pool[0], nb_pool[0], nb_pool[0])))
      model.add(Convolution3D(nb_filter=nb_filters[1], nb_depth=nb_conv[1],
                              nb_row=nb_conv[1], nb_col=nb_conv[1], border_mode="full"))
      model.add(Activation('relu'))
      model.add(MaxPooling3D(pool_size=(nb_pool[1], nb_pool[1], nb_pool[1])))
      model.add(Convolution3D(nb_filter=nb_filters[2], nb_depth=nb_conv[2],
                              nb_row=nb_conv[2], nb_col=nb_conv[2], border_mode="full"))
      model.add(Activation('relu'))
      model.add(MaxPooling3D(pool_size=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(Flatten())
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # Eventually figure out a more general rule that works for all axis sizes.
      model.add(Dense(16, init='normal'))
      model.add(Activation('relu'))
      model.add(Dropout(0.5))
      # TODO(rbharath): Generalize this to support classification as well as regression.
      model.add(Dense(1, init='normal'))

      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
      print("About to compile model")
      model.compile(loss=loss_function, optimizer=sgd)
      self.raw_model = model

  def fit_on_batch(self, X, y, w):
    # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    X = shuffle_data(X)
    loss = self.raw_model.train_on_batch(X, y)
    print("Loss: %f" % loss)

  def predict_on_batch(self, X):
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    X = shuffle_data(X)
    y_pred = self.raw_model.predict_on_batch(X)
    y_pred = np.squeeze(y_pred)
    return y_pred
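
# Illustrative sketch (editor's addition, not part of this commit): the
# per-batch interface is what lets training work on datasets that exceed RAM.
# `iter_shards` is a hypothetical loader that yields (X, y, w) chunks from
# disk one at a time; only one chunk is ever held in memory.
#
#   model = DockingDNN(task_types, model_params)
#   for epoch in range(nb_epoch):
#     for X_shard, y_shard, w_shard in iter_shards(shard_dir):
#       model.fit_on_batch(X_shard, y_shard, w_shard)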
+30 −0
"""
Factory function to construct models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deep_chem.models.deep import SingleTaskDNN
from deep_chem.models.deep import MultiTaskDNN
from deep_chem.models.deep3d import DockingDNN
from deep_chem.models.standard import SklearnModel

def model_builder(model_type, task_types, model_params,
                  initialize_raw_model=True):
  """
  Factory function to construct model.
  """
  if model_type == "singletask_deep_network":
    model = SingleTaskDNN(task_types, model_params,
                          initialize_raw_model)
  elif model_type == "multitask_deep_network":
    model = MultiTaskDNN(task_types, model_params,
                         initialize_raw_model)
  elif model_type == "convolutional_3D_regressor":
    model = DockingDNN(task_types, model_params,
                       initialize_raw_model)
  else:
    model = SklearnModel(task_types, model_params)
  return model
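
# Illustrative usage (editor's addition, not part of this commit); the
# parameter values shown are hypothetical but mirror what DockingDNN reads
# from model_params above.
#
#   model = model_builder("convolutional_3D_regressor",
#                         task_types={"binding_energy": "regression"},
#                         model_params={"data_shape": (32, 32, 32, 4),
#                                       "learning_rate": 0.01,
#                                       "loss_function": "mean_squared_error"})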
+24 −0
@@ -9,6 +9,30 @@ from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from deep_chem.models import Model

class SklearnModel(Model):
  """
  Wrapper for scikit-learn models. Currently a stub.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")   

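# Illustrative sketch (editor's addition, not part of this commit): one way the
# stub above could be filled in for scikit-learn estimators that support
# incremental learning via partial_fit (e.g. SGDRegressor), so that each call
# only needs the current batch in memory. The class name is hypothetical.
import numpy as np
from sklearn.linear_model import SGDRegressor

class IncrementalSklearnModel(Model):
  """Wraps an sklearn estimator that exposes partial_fit."""
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = SGDRegressor() if initialize_raw_model else None

  def fit_on_batch(self, X, y, w):
    self.raw_model.partial_fit(X, np.ravel(y), sample_weight=np.ravel(w))

  def predict_on_batch(self, X):
    return self.raw_model.predict(X)
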
def fit_singletask_models(train_data, modeltype):
  """Fits singletask linear regression models to potency.