Commit 658c0b91 authored by evanfeinberg

major refactor. added eval_on_batch

parent 0fdabafb
+94 −4
class Model:
  def __init__(self, task_types, training_params):
"""
Contains an abstract base class that supports different ML models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
  """
  Abstract base class for different ML models.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.training_params = training_params
    self.model_params = model_params
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")    

  def set_raw_model(self, raw_model):
    """
    Set underlying raw model. Useful when loading from disk.
    """
    self.raw_model = raw_model

  def get_raw_model(self):
    """
    Return raw model.
    """
    return(self.raw_model)


'''
def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns an ndarray of shape (n_samples, n_targets)

  TODO(rbharath): This function uses n_targets instead of
  task_transforms like everything else.

  Parameters
  ----------
  X: numpy.ndarray
    Test set data.
  model: model.
    A trained scikit-learn or keras model.
  n_targets: int
    Number of output targets
  task_types: dict
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  modeltype: string
    Either sklearn, keras, or keras_multitask
  """
  # Extract features for test set and make preds
  # TODO(rbharath): This change in shape should not(!) be handled here. Make
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_targets):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      print("model_predictions()")
      print("np.shape(X)")
      print(np.shape(X))
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
  elif modeltype == "keras-sequential":
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds
'''
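For orientation: the refactored Model base class stores task_types and model_params and keeps the underlying learner in raw_model, while fit_on_batch and predict_on_batch are left to subclasses. A minimal sketch of a concrete subclass (hypothetical, not part of this commit) wrapping a scikit-learn regressor:

# Hypothetical subclass sketch, not part of this commit, showing how a concrete
# model plugs into the abstract Model interface defined above.
from sklearn.ensemble import RandomForestRegressor

from deep_chem.models import Model

class SimpleSklearnModel(Model):
  """Minimal single-task wrapper around a scikit-learn regressor."""
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(SimpleSklearnModel, self).__init__(task_types, model_params)
    if initialize_raw_model:
      self.raw_model = RandomForestRegressor(
          n_estimators=model_params.get("n_estimators", 100))

  def fit_on_batch(self, X, y, w):
    # Forests have no true incremental update, so each batch is a full refit.
    self.raw_model.fit(X, y, sample_weight=w)

  def predict_on_batch(self, X):
    return self.raw_model.predict(X)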
+64 −77
"""
Code for training 3D convolutions.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
from keras.optimizers import RMSprop
from keras.models import Sequential
@@ -8,11 +13,13 @@ from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from deep_chem.models import Model


class DockingDNN(Model):
  def __init__(self, task_types, model_params):
    (n_samples, axis_length, _, _, n_channels) = model_params["data_shape"]
    self.model_params = model_params
  """
  Wrapper class for fitting 3D convolutional networks for deep docking.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    if initialize_raw_model:
      (_, axis_length, _, _, n_channels) = model_params["data_shape"]

      learning_rate = model_params["learning_rate"]
      loss_function = model_params["loss_function"]
@@ -25,7 +32,6 @@ class DockingDNN(Model):

      # level of convolution to perform at each layer (CONV x CONV)
      nb_conv = [7, 5, 3]

      model = Sequential()
      model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
@@ -52,53 +58,34 @@ class DockingDNN(Model):
      model.add(Dense(1, init='normal'))

      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
    print "About to compile model"
      print("About to compile model")
      model.compile(loss=loss_function, optimizer=sgd)
      self.model = model
    super(DockingDNN, self).__init__(task_types, training_params)
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)

  def fit_on_batch(self, X, y, w):
    print "Training 3D model"
    print "Original shape of X: " + str(np.shape(X))
    print "Shuffling X dimensions to match convnet"
    print("Training 3D model")
    print("Original shape of X: " + str(np.shape(X)))
    print("Shuffling X dimensions to match convnet")
    # TODO(rbharath): Modify the featurization so that it matches the desired shape.
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
    print "Final shape of X: " + str(np.shape(X))
    print("Final shape of X: " + str(np.shape(X)))

    print "About to fit data to model."
    print("About to fit data to model.")
    batch_size = self.model_params["batch_size"]
    nb_epoch = self.model_params["nb_epoch"]
    y = y.itervalues().next()
    model.train_on_batch(X, y, batch_size=batch_size, nb_epoch=nb_epoch)
    self.raw_model.train_on_batch(X, y, batch_size=batch_size, nb_epoch=nb_epoch)
    print("Finished training on batch.")

'''
def fit_3D_convolution(train_data, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
  models = {}
  X_train = train_data["features"]
  if len(train_data["sorted_tasks"]) > 1:
    raise ValueError("3D Convolutions only supported for singletask.")
  task_name = train_data["sorted_tasks"][0]
  (y_train, _) = train_data["sorted_tasks"].itervalues().next()
  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
  return models
'''
'''
def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,
                         loss_function="mean_squared_error"):

  """
  Fit a keras 3D CNN to data.

  Parameters
  ----------
  nb_epoch: int
    maximal number of epochs to run the optimizer
  """
  def predict_on_batch(self, X):
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
    y_pred = self.raw_model.predict_on_batch(X)
    y_pred = np.squeeze(y_pred)
    return(y_pred)
    
  return model
'''
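As a sanity check on the shape handling above (grid size and channel count below are made-up placeholders): DockingDNN expects batches shaped (n_samples, N, N, N, n_channels) and rearranges them before handing them to the Keras Convolution3D stack.

# Illustrative shape check only; the dimensions here are placeholders.
import numpy as np

n_samples, axis_length, n_channels = 8, 32, 4
X = np.random.rand(n_samples, axis_length, axis_length, axis_length, n_channels)

# The same reshape fit_on_batch and predict_on_batch apply before calling Keras.
X_conv = np.reshape(
    X, (n_samples, axis_length, n_channels, axis_length, axis_length))
print(X_conv.shape)  # (8, 32, 4, 32, 32)

# Note: np.transpose(X, (0, 1, 4, 2, 3)) yields the same shape while preserving
# voxel ordering; np.reshape only reinterprets the flat memory layout.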
+9 −6
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#from deep_chem.models.deep import SingleTaskDNN
#from deep_chem.models.deep import MultiTaskDNN
from deep_chem.models.deep3D import DockingDNN

def model_builder(model_type, task_type, training_params):
def model_builder(model_type, task_types, model_params):
  if model_type == "singletask_deep_network":
    model = SingleTaskDNN(task_types, training_params)
    model = SingleTaskDNN(task_types, model_params)
  elif model_type == "multitask_deep_network":
    model = MultiTaskDNN(task_types, training_params)
    model = MultiTaskDNN(task_types, model_params)
  elif model_type== "3D_cnn":
    model = DockingDNN(task_types, training_params)
    model = DockingDNN(task_types, model_params)
  else:
    #model = sklean_models(train_dict, model)
    raise ValueError("Model type not recognized.")
    model = SklearnModel(task_types, model_params)
  return(model)
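A hedged sketch of calling the refactored model_builder for the 3D CNN case; the import path and every parameter value below are assumptions, and the full set of keys DockingDNN reads is not visible in this diff:

# Hypothetical usage; import path and parameter values are placeholders.
from deep_chem.models import model_builder  # actual module path may differ

task_types = {"binding_energy": "regression"}
model_params = {
    "data_shape": (None, 32, 32, 32, 4),  # (n_samples, N, N, N, n_channels)
    "learning_rate": 0.01,
    "loss_function": "mean_squared_error",
    "batch_size": 50,
    "nb_epoch": 1,
}
model = model_builder("3D_cnn", task_types, model_params)
# fit_on_batch expects y as a dict keyed by task name (see y.itervalues() above).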
+56 −80
@@ -6,26 +6,11 @@ from __future__ import division
from __future__ import unicode_literals
import argparse
import os
from deep_chem.utils.featurize import generate_directories
from deep_chem.utils.featurize import extract_data
from deep_chem.utils.featurize import generate_targets
from deep_chem.utils.featurize import generate_features
from deep_chem.utils.featurize import generate_vs_utils_features
from deep_chem.utils.featurize import featurize_inputs
from deep_chem.models.standard import fit_singletask_models
from deep_chem.utils.load import process_datasets
from deep_chem.utils.load import transform_data
from deep_chem.utils.save import save_model
from deep_chem.utils.save import load_model
from deep_chem.utils.save import save_sharded_dataset
from deep_chem.utils.save import load_sharded_dataset
from deep_chem.utils.evaluate import results_to_csv
from deep_chem.utils.evaluate import eval_trained_model
from deep_chem.utils.evaluate import compute_model_performance
from deep_chem.utils.preprocess import train_test_split
from deep_chem.utils.fit import fit_model


def add_featurization_command(subparsers):
  """Adds flags for featurize subcommand."""
  featurize_cmd = subparsers.add_parser(
@@ -44,9 +29,6 @@ def add_featurize_group(featurize_cmd):
      help="Type of input file. If pandas, input must be a joblib\n"
           "containing a pandas dataframe. If sdf, should be in\n"
           "(perhaps gzipped) sdf file.")
  featurize_group.add_argument(
      "--delimiter", default=",", type=str,
      help="If csv input, delimiter to use for read csv file")
  featurize_group.add_argument(
      "--fields", required=1, nargs="+",
      help="Names of fields.")
@@ -58,7 +40,7 @@ def add_featurize_group(featurize_cmd):
      "--feature-fields", type=str, nargs="+",
      help="Optional field that holds pre-computed feature vector")
  featurize_group.add_argument(
      "--target-fields", type=str, nargs="+", required=1,
      "--task-fields", type=str, nargs="+", required=1,
      help="Name of measured field to predict.")
  featurize_group.add_argument(
      "--split-field", type=str, default=None,
@@ -75,11 +57,9 @@ def add_featurize_group(featurize_cmd):
      "--threshold", type=float, default=None,
      help="If specified, will be used to binarize real-valued target-fields.")
  featurize_group.add_argument(
      "--name", required=1,
      help="Name of the dataset.")
  featurize_group.add_argument(
      "--out", required=1,
      help="Folder to generate processed dataset in.")
      "--feature-dir", type=str, required=1,
      help="Directory where featurized dataset will be stored. \n"
           "Will be created if does not exist")
  featurize_group.set_defaults(func=featurize_inputs_wrapper)

def add_train_test_command(subparsers):
@@ -118,11 +98,8 @@ def add_train_test_command(subparsers):
      choices=["singletask", "multitask"],
      help="Type of model being built.")
  train_test_cmd.add_argument(
      "--train-out", type=str, required=1,
      help="Location to save train set.")
  train_test_cmd.add_argument(
      "--test-out", type=str, required=1,
      help="Location to save test set.")
      "--data-dir", type=str, required=1,
      help="Location to save train and test data.")
  train_test_cmd.set_defaults(func=train_test_split_wrapper)

def add_model_group(fit_cmd):
@@ -160,10 +137,6 @@ def add_model_group(fit_cmd):
  group.add_argument(
      "--decay", type=float, default=1e-4,
      help="Learning rate decay for NN models.")
  group.add_argument(
      "--validation-split", type=float, default=0.0,
      help="Percent of training data to use for validation.")


def add_fit_command(subparsers):
  """Adds arguments for fit subcommand."""
@@ -171,16 +144,15 @@ def add_fit_command(subparsers):
      "fit", help="Fit a model to training data.")
  group = fit_cmd.add_argument_group("load-and-transform")
  group.add_argument(
      "--saved-data", required=1,
      "--data-dir", required=1,
      help="Location of saved transformed data.")
  add_model_group(fit_cmd)
  group = fit_cmd.add_argument_group("save")
  group.add_argument(
      "--saved-out", type=str, required=1,
      "--model-dir", type=str, required=1,
      help="Location to save trained model.")
  fit_cmd.set_defaults(func=fit_model_wrapper)


def add_eval_command(subparsers):
  """Adds arguments for eval subcommand."""
  eval_cmd = subparsers.add_parser(
@@ -249,63 +221,70 @@ def add_model_command(subparsers):
  add_model_group(model_cmd)
  model_cmd.set_defaults(func=create_model)

def extract_training_params(args):
def extract_model_params(args):
  """
  Given input arguments, return a dict specifying model parameters.
  """
  params = ["nb_hidden", "learning_rate", "dropout",
            "nb_epoch", "decay", "batch_size", "loss_function"]

  training_params = {param : getattr(args, param) for param in params}
  return(training_params)
  model_params = {param : getattr(args, param) for param in params}
  return(model_params)
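For context, extract_model_params just gathers the model hyperparameter flags into a dict; an illustration with a hand-built Namespace (values are placeholders):

# Illustration of the dict extract_model_params builds; values are placeholders.
import argparse

args = argparse.Namespace(
    nb_hidden=500, learning_rate=0.01, dropout=0.5,
    nb_epoch=10, decay=1e-4, batch_size=50,
    loss_function="mean_squared_error")
params = ["nb_hidden", "learning_rate", "dropout",
          "nb_epoch", "decay", "batch_size", "loss_function"]
model_params = {param: getattr(args, param) for param in params}
# {'nb_hidden': 500, 'learning_rate': 0.01, 'dropout': 0.5, ...}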

def create_model(args):
  """Creates a model"""
  data_dir = os.path.join(args.out, args.name)
  feature_dir = args.feature_dir
  if not os.path.exists(feature_dir):
    os.makedirs(feature_dir)

  data_dir = args.data_dir
  if not os.path.exists(data_dir):
    os.makedirs(data_dir)

  model_dir = args.model_dir
  model_name = args.model

  print("+++++++++++++++++++++++++++++++++")
  print("Perform featurization")
  if not args.skip_featurization:
    featurize_inputs(
        args.name, args.out, args.input_file, args.input_type, args.fields,
        args.field_types, args.feature_fields, args.target_fields,
        args.smiles_field, args.split_field, args.id_field, args.threshold,
        args.delimiter)
        feature_dir, args.input_files, args.input_type, args.fields,
        args.field_types, args.feature_fields, args.task_fields,
        args.smiles_field, args.split_field, args.id_field, args.threshold)

  print("+++++++++++++++++++++++++++++++++")
  print("Perform train-test split")
  paths = [data_dir]
  train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
  test_out = os.path.join(data_dir, "%s-test.joblib" % args.name)
  paths = [feature_dir]
  if not args.skip_train_test_split:
    train_test_split(
        paths, args.output_transforms, args.input_transforms, args.feature_types,
        args.splittype, args.mode, train_out, test_out,
        args.target_fields)
        args.splittype, args.mode, data_dir)

  print("+++++++++++++++++++++++++++++++++")
  print("Fit model")
  model_type = get_model_type(args.model)
  extension = get_model_extension(model_type)
  saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
  if not args.skip_fit:
    training_params = extract_training_params(args)
    model_params = extract_model_params(args)
    fit_model(
        args.model, training_params,
        args.validation_split, saved_out, train_out, args.target_fields)
        model_name, model_params, model_dir, data_dir)

  print("+++++++++++++++++++++++++++++++++")
  print("Eval Model on Train")
  print("-------------------")
  csv_out_train = os.path.join(data_dir, "%s-train.csv" % args.name)
  stats_out_train = os.path.join(data_dir, "%s-train-stats.txt" % args.name)
  csv_out_test = os.path.join(data_dir, "%s-test.csv" % args.name)
  stats_out_test = os.path.join(data_dir, "%s-test-stats.txt" % args.name)
  csv_out_train = os.path.join(data_dir, "train.csv")
  stats_out_train = os.path.join(data_dir, "train-stats.txt")
  csv_out_test = os.path.join(data_dir, "test.csv")
  stats_out_test = os.path.join(data_dir, "test-stats.txt")
  eval_trained_model(
      model_type, saved_out, train_out, csv_out_train, 
      stats_out_train, args.target_fields)
      model_type, model_dir, data_dir, csv_out_train,
      stats_out_train, args.task_fields, split="train")
  print("Eval Model on Test")
  print("------------------")
  eval_trained_model(
      model_type, saved_out, test_out, csv_out_test, 
      stats_out_test, args.target_fields)
      model_type, model_dir, data_dir, csv_out_test,
      stats_out_test, args.task_fields, split="test")

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -323,27 +302,26 @@ def parse_args(input_args=None):

def featurize_inputs_wrapper(args):
  """Wrapper function that calls _featurize_input with args unwrapped."""
  if not os.path.exists(args.feature_dir):
    os.makedirs(args.feature_dir)
  featurize_inputs(
      args.name, args.out, args.input_file, args.input_type, args.fields,
      args.field_types, args.feature_fields, args.target_fields,
      args.smiles_field, args.split_field, args.id_field, args.threshold,
      args.delimiter)
      args.feature_dir, args.input_files, args.input_type, args.fields,
      args.field_types, args.feature_fields, args.task_fields,
      args.smiles_field, args.split_field, args.id_field, args.threshold)

def train_test_split_wrapper(args):
  """Wrapper function that calls _train_test_split_wrapper after unwrapping args."""
  preprocess.train_test_split(
      args.paths, args.output_transforms, args.input_transforms,
      args.feature_types, args.splittype, args.mode,
      args.train_out, args.test_out, args.target_fields)
  train_test_split(args.paths, args.output_transforms,
                   args.input_transforms, args.feature_types,
                   args.splittype, args.mode, args.data_dir)

def fit_model_wrapper(args):
  """Wrapper that calls _fit_model with arguments unwrapped."""
  training_params = extract_training_params(args)
  model_params = extract_model_params(args)
  fit_model(
      args.model, training_params, args.validation_split,
      args.saved_out, args.saved_data, args.target_fields)
      args.model_name, model_params, args.model_dir, args.data_dir)

def get_model_type(model):
  """Associate each model with a model_type (used for saving/loading)."""
@@ -369,10 +347,8 @@ def get_model_extension(model_type):
def eval_trained_model_wrapper(args):
  """Wrapper function that calls _eval_trained_model with unwrapped args."""
  eval_trained_model(
      args.model_type, args.saved_model, args.saved_data,
      args.csv_out, args.stats_out, args.target_fields)


      args.model_type, args.saved_model, args.data_dir,
      args.csv_out, args.stats_out, args.task_fields)

def main():
  """Invokes argument parser."""
+79 −327

File changed.

Preview size limit exceeded, changes collapsed.
