Commit 415033d0 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Merge pull request #35 from evanfeinberg/master

Working with Data Sets exceeding RAM 
parents 6d4f6df0 34925328
Loading
Loading
Loading
Loading
+101 −0
Original line number Original line Diff line number Diff line
"""
Contains an abstract base class that supports different ML models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
  """
  Abstract base class for different ML models.

  Stores the per-task types and model hyperparameters, and holds a
  reference to the underlying "raw" model object (e.g. a keras or
  sklearn model) that subclasses construct.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # initialize_raw_model is accepted so every subclass shares one
    # constructor signature; the abstract base has no raw model to build.
    self.task_types = task_types
    self.model_params = model_params
    # Ensure the attribute always exists so get_raw_model() returns None
    # (instead of raising AttributeError) before set_raw_model is called.
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

  def set_raw_model(self, raw_model):
    """
    Set underlying raw model. Useful when loading from disk.
    """
    self.raw_model = raw_model

  def get_raw_model(self):
    """
    Return underlying raw model.
    """
    return self.raw_model


# NOTE(review): dead code — a legacy prediction helper kept commented out in a
# triple-quoted string (a no-op at runtime). It appears superseded by the new
# Model.predict_on_batch interface — confirm, then delete rather than keep
# accumulating commented-out code.
'''
def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns an ndarray of shape (n_samples, n_targets)

  TODO(rbharath): This function uses n_targets instead of
  task_transforms like everything else.

  Parameters
  ----------
  X: numpy.ndarray
    Test set data.
  model: model.
    A trained scikit-learn or keras model.
  n_targets: int
    Number of output targets
  task_types: dict
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  modeltype: string
    Either sklearn, keras, or keras_multitask
  """
  # Extract features for test set and make preds
  # TODO(rbharath): This change in shape should not(!) be handled here. Make
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_targets):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      print("model_predictions()")
      print("np.shape(X)")
      print(np.shape(X))
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
  elif modeltype == "keras-sequential":
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds
'''
+67 −3
Original line number Original line Diff line number Diff line
@@ -5,8 +5,72 @@ import numpy as np
from keras.models import Graph
from keras.models import Graph
from keras.layers.core import Dense, Dropout
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD
from keras.optimizers import SGD
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.models import Model


#TODO(rbharath/enf): Make this real. It's a dummy now.
class SingleTaskDNN(Model):
  """
  Single-task deep neural network model.

  Placeholder implementation: fit_on_batch/predict_on_batch are not yet
  implemented for this class.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # Delegate common attribute setup to the Model base class, matching
    # how DockingDNN initializes itself.
    super(SingleTaskDNN, self).__init__(task_types, model_params,
                                        initialize_raw_model)
    # No underlying keras model yet; populated once fitting is implemented.
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

#TODO(rbharath/enf): Make this real. It's a dummy now.
class MultiTaskDNN(Model):
  """
  Multi-task deep neural network model.

  Placeholder implementation: fit_on_batch/predict_on_batch are not yet
  implemented for this class.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # Delegate common attribute setup to the Model base class, matching
    # how DockingDNN initializes itself.
    super(MultiTaskDNN, self).__init__(task_types, model_params,
                                       initialize_raw_model)
    # No underlying keras model yet; populated once fitting is implemented.
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

def to_one_hot(y):
  """Transforms label vector into one-hot encoding.

  Turns y into vector of shape [n_samples, 2] (assuming binary labels).
  Rows whose label is neither 0 nor 1 are left all-zero, matching the
  original per-element behavior.

  Parameters
  ----------
  y: np.ndarray
    A vector of shape [n_samples, 1] (or [n_samples]).

  Returns
  -------
  np.ndarray of shape [n_samples, 2].
  """
  # Flatten so both [n_samples] and [n_samples, 1] inputs are handled.
  y = np.asarray(y).ravel()
  y_hot = np.zeros((len(y), 2))
  # Vectorized equivalent of the element-wise loop: boolean masks become
  # 1.0/0.0 on assignment into the float array.
  y_hot[:, 0] = (y == 0)
  y_hot[:, 1] = (y == 1)
  return y_hot


def fit_multitask_mlp(train_data, task_types, **training_params):
def fit_multitask_mlp(train_data, task_types, **training_params):
  """
  """
@@ -62,7 +126,7 @@ def fit_singletask_mlp(train_data, task_types, **training_params):


def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                          dropout=0.5, nb_epoch=20, batch_size=50, nb_hidden=500,
                          validation_split=0.1):
                          validation_split=0.1):
  """
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Perform stochastic gradient descent optimization for a keras multitask MLP.
@@ -106,7 +170,7 @@ def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
  #model.add_input(name="input", ndim=n_inputs)
  #model.add_input(name="input", ndim=n_inputs)
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_node(
  model.add_node(
      Dense(n_hidden, init='uniform', activation=activation),
      Dense(nb_hidden, init='uniform', activation=activation),
      name="dense", input="input")
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
  top_layer = "dropout"
+72 −62
Original line number Original line Diff line number Diff line
"""
"""
Code for training 3D convolutions.
Code for training 3D convolutions.
"""
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import numpy as np
from keras.optimizers import RMSprop
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from deep_chem.models import Model


def fit_3D_convolution(train_data, **training_params):
def shuffle_shape(shape):
  """Permute a channels-last sample shape to channels-first order.

  Reconstructed from garbled diff interleaving (old fit_3D_convolution
  residue removed).

  Parameters
  ----------
  shape: tuple
    (axis_length, axis_length, axis_length, n_channels) — a cubic volume,
    so a single axis_length describes all three spatial axes.

  Returns
  -------
  tuple of (n_channels, axis_length, axis_length, axis_length).
  """
  (axis_length, _, _, n_channels) = shape
  return (n_channels, axis_length, axis_length, axis_length)


def shuffle_data(X):
  """Convert a batch from channels-last to channels-first layout.

  Parameters
  ----------
  X: np.ndarray
    Tensor of shape (n_samples, axis_length, axis_length, axis_length,
    n_channels).

  Returns
  -------
  np.ndarray of shape (n_samples, n_channels, axis_length, axis_length,
  axis_length) containing the same voxels.
  """
  # Bug fix: the original used np.reshape, which preserves the flat element
  # order and therefore scrambles voxel values across axes instead of moving
  # the channel axis. Relocating an axis requires a transpose (axis
  # permutation), not a reshape.
  return np.transpose(X, (0, 4, 1, 2, 3))


  """
  Fit a keras 3D CNN to datat.


  Parameters
class DockingDNN(Model):
  ----------
  nb_epoch: int
    maximal number of epochs to run the optimizer
  """
  """
  print "Training 3D model"
  Wrapper class for fitting 3D convolutional networks for deep docking.
  print "Original shape of X: " + str(np.shape(X))
  """
  print "Shuffling X dimensions to match convnet"
  def __init__(self, task_types, model_params, initialize_raw_model=True):
  # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)
  (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    if initialize_raw_model:
  X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
      (axis_length, _, _, n_channels) = model_params["data_shape"]
  print "Final shape of X: " + str(np.shape(X))
      self.input_shape = (n_channels, 
                          axis_length, axis_length, axis_length)

      learning_rate = model_params["learning_rate"]
      loss_function = model_params["loss_function"]


         # number of convolutional filters to use at each layer
         # number of convolutional filters to use at each layer
      nb_filters = [axis_length/2, axis_length, axis_length]
      nb_filters = [axis_length/2, axis_length, axis_length]
@@ -47,35 +46,46 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,


      # level of convolution to perform at each layer (CONV x CONV)
      # level of convolution to perform at each layer (CONV x CONV)
      nb_conv = [7, 5, 3]
      nb_conv = [7, 5, 3]

      model = Sequential()
      model = Sequential()
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,

      model.add(Convolution3D(nb_filter=nb_filters[0], nb_depth=nb_conv[0], 
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
                          nb_depth=nb_conv[0], border_mode='valid'))
                              input_shape=self.input_shape, border_mode="full"))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))

  model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
      model.add(MaxPooling3D(pool_size=(nb_pool[0], nb_pool[0], nb_pool[0])))
                          nb_row=nb_conv[1], nb_col=nb_conv[1], nb_depth=nb_conv[1],
      model.add(Convolution3D(nb_filter=nb_filters[1],  nb_depth=nb_conv[1],
                          border_mode='valid'))
                              nb_row=nb_conv[1], nb_col=nb_conv[1], border_mode="full"))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[1], nb_pool[1], nb_pool[1])))
      model.add(MaxPooling3D(pool_size=(nb_pool[1], nb_pool[1], nb_pool[1])))
  model.add(Convolution3D(nb_filter=nb_filters[2], stack_size=nb_filters[1],
      model.add(Convolution3D(nb_filter=nb_filters[2], nb_depth=nb_conv[2],
                          nb_row=nb_conv[2], nb_col=nb_conv[2],
                              nb_row=nb_conv[2], nb_col=nb_conv[2], border_mode="full"))
                          nb_depth=nb_conv[2], border_mode='valid'))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(MaxPooling3D(pool_size=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(Flatten())
      model.add(Flatten())
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # Eventually figure out a more general rule that works for all axis sizes.
      # Eventually figure out a more general rule that works for all axis sizes.
  model.add(Dense(32/2, init='normal'))
      model.add(Dense(16, init='normal'))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
      model.add(Dropout(0.5))
      model.add(Dropout(0.5))
  # TODO(rbharath): Generalize this to support classification as well as regression.
      model.add(Dense(1, init='normal'))
      model.add(Dense(1, init='normal'))


      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
  print "About to compile model"
      print("About to compile model")
      model.compile(loss=loss_function, optimizer=sgd)
      model.compile(loss=loss_function, optimizer=sgd)
  print "About to fit data to model."
      self.raw_model = model
  model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)

  return model
  def fit_on_batch(self, X, y, w):
    """Train the wrapped keras model on a single batch.

    Parameters
    ----------
    X: np.ndarray
      Batch of featurized samples in channels-last layout; reordered to
      channels-first before training.
    y: np.ndarray
      Batch labels.
    w: np.ndarray
      Batch weights (currently unused).
    """
    # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    channels_first = shuffle_data(X)
    batch_loss = self.raw_model.train_on_batch(channels_first, y)
    print("Loss: %f" % batch_loss)

  def predict_on_batch(self, X):
    """Make predictions for one batch of data.

    Rejects anything that is not a rank-5 tensor of shape
    (n_samples, N, N, N, n_channels), reorders it to channels-first, and
    returns the squeezed predictions of the underlying keras model.
    """
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    predictions = self.raw_model.predict_on_batch(shuffle_data(X))
    return np.squeeze(predictions)
+30 −0
Original line number Original line Diff line number Diff line
"""
Factory function to construct models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deep_chem.models.deep import SingleTaskDNN
from deep_chem.models.deep import MultiTaskDNN
from deep_chem.models.deep3d import DockingDNN
from deep_chem.models.standard import SklearnModel

def model_builder(model_type, task_types, model_params,
                  initialize_raw_model=True):
  """
  Factory function to construct model.

  Maps a model_type string to the matching Model subclass. Any
  unrecognized model_type falls through to SklearnModel (which is
  constructed without initialize_raw_model, as before).
  """
  if model_type == "singletask_deep_network":
    return SingleTaskDNN(task_types, model_params, initialize_raw_model)
  if model_type == "multitask_deep_network":
    return MultiTaskDNN(task_types, model_params, initialize_raw_model)
  if model_type == "convolutional_3D_regressor":
    return DockingDNN(task_types, model_params, initialize_raw_model)
  return SklearnModel(task_types, model_params)
+24 −0
Original line number Original line Diff line number Diff line
@@ -9,6 +9,30 @@ from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LassoLarsCV
from deep_chem.models import Model

class SklearnModel(Model):
  """
  Wrapper class for scikit-learn models.

  Placeholder implementation: fit_on_batch/predict_on_batch are not yet
  implemented for this class.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # Delegate common attribute setup to the Model base class for
    # consistency with the other Model subclasses.
    super(SklearnModel, self).__init__(task_types, model_params,
                                       initialize_raw_model)
    # No underlying sklearn estimator yet; populated once fitting is
    # implemented (or via set_raw_model when loading from disk).
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")


def fit_singletask_models(train_data, modeltype):
def fit_singletask_models(train_data, modeltype):
  """Fits singletask linear regression models to potency.
  """Fits singletask linear regression models to potency.
Loading