Commit 0fdabafb authored by evanfeinberg

Added new Model base class and fit framework

parent 9c5594d2
+12 −0
class Model(object):
  """Abstract base class for deep_chem models.

  Subclasses must implement fit_on_batch and predict_on_batch.
  (Model inherits from object so that subclasses such as DockingDNN
  can call super() under Python 2.)
  """

  def __init__(self, task_types, training_params):
    self.task_types = task_types
    self.training_params = training_params

  def fit_on_batch(self, X, y, w):
    """Perform one step of training on a single batch of weighted data."""
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """Generate predictions for a single batch of data."""
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")
 No newline at end of file
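
The abstract Model API above expects each subclass to provide fit_on_batch and predict_on_batch. A minimal sketch of a conforming subclass (MeanModel is hypothetical, purely to illustrate the contract):

import numpy as np

from deep_chem.models import Model

class MeanModel(Model):
  """Hypothetical model: predicts the running mean of the training targets."""

  def __init__(self, task_types, training_params):
    super(MeanModel, self).__init__(task_types, training_params)
    self.total, self.count = 0.0, 0

  def fit_on_batch(self, X, y, w):
    # Accumulate a running mean of the targets; weights w are ignored here.
    self.total += np.sum(y)
    self.count += y.size

  def predict_on_batch(self, X):
    # Predict the constant running mean for every sample in the batch.
    mean = self.total / self.count if self.count else 0.0
    return np.full(len(X), mean)
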
+71 −48
@@ -6,38 +6,16 @@ from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from deep_chem.models import Model

def fit_3D_convolution(train_data, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
  models = {}
  X_train = train_data["features"]
  if len(train_data["sorted_tasks"]) > 1:
    raise ValueError("3D Convolutions only supported for singletask.")
  task_name = train_data["sorted_tasks"][0]
  (y_train, _) = train_data["sorted_tasks"].itervalues().next()
  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
  return models

def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,
                         loss_function="mean_squared_error"):

  """
  Fit a keras 3D CNN to data.
class DockingDNN(Model):
  def __init__(self, task_types, model_params):
    (n_samples, axis_length, _, _, n_channels) = model_params["data_shape"]
    self.model_params = model_params

  Parameters
  ----------
  nb_epoch: int
    maximal number of epochs to run the optimizer
  """
  print "Training 3D model"
  print "Original shape of X: " + str(np.shape(X))
  print "Shuffling X dimensions to match convnet"
  # TODO(rbharath): Modify the featurization so that it matches desired shaped.
  (n_samples, axis_length, _, _, n_channels) = np.shape(X)
  X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  print "Final shape of X: " + str(np.shape(X))
    learning_rate = model_params["learning_rate"]
    loss_function = model_params["loss_function"]

    # number of convolutional filters to use at each layer
    nb_filters = [axis_length/2, axis_length, axis_length]
@@ -76,6 +54,51 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,
    sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
    print "About to compile model"
    model.compile(loss=loss_function, optimizer=sgd)
    self.model = model
    super(DockingDNN, self).__init__(task_types, model_params)

  def fit_on_batch(self, X, y, w):
    print "Training 3D model"
    print "Original shape of X: " + str(np.shape(X))
    print "Shuffling X dimensions to match convnet"
    # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
    print "Final shape of X: " + str(np.shape(X))

    print "About to fit data to model."
  model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)
    # DockingDNN is singletask, so pull out the sole task's labels.
    y = y.itervalues().next()
    # train_on_batch consumes the batch as given; it takes no
    # batch_size/nb_epoch arguments, and the compiled model lives on self.
    self.model.train_on_batch(X, y)
    print("Finished training on batch.")

'''
def fit_3D_convolution(train_data, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
  models = {}
  X_train = train_data["features"]
  if len(train_data["sorted_tasks"]) > 1:
    raise ValueError("3D Convolutions only supported for singletask.")
  task_name = train_data["sorted_tasks"][0]
  (y_train, _) = train_data["sorted_tasks"].itervalues().next()
  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
  return models
'''
'''
def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,
                         loss_function="mean_squared_error"):

  """
  Fit a keras 3D CNN to data.

  Parameters
  ----------
  nb_epoch: int
    maximal number of epochs to run the optimizer
  """
 
  return model
'''
 No newline at end of file
+15 −0
#from deep_chem.models.deep import SingleTaskDNN
#from deep_chem.models.deep import MultiTaskDNN
from deep_chem.models.deep3D import DockingDNN

def model_builder(model_type, task_types, training_params):
  if model_type == "singletask_deep_network":
    model = SingleTaskDNN(task_types, training_params)
  elif model_type == "multitask_deep_network":
    model = MultiTaskDNN(task_types, training_params)
  elif model_type == "3D_cnn":
    model = DockingDNN(task_types, training_params)
  else:
    #model = sklearn_models(train_dict, model)
    raise ValueError("Model type not recognized.")
  return model
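
A quick sketch of how model_builder might be wired up, assuming this file is importable as deep_chem.models.model_builder (the actual path isn't shown in this preview) and using the DockingDNN parameters from deep3D.py above; the concrete values and task name are illustrative only:

from deep_chem.models.model_builder import model_builder  # hypothetical path

# data_shape follows the (n_samples, x, y, z, channels) convention
# unpacked in DockingDNN.__init__; these numbers are made up.
model_params = {
    "data_shape": (50, 32, 32, 32, 8),
    "learning_rate": 0.01,
    "loss_function": "mean_squared_error",
    "batch_size": 50,
    "nb_epoch": 1,
}
task_types = {"binding_affinity": "regression"}  # hypothetical task

model = model_builder("3D_cnn", task_types, model_params)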
+62 −243
@@ -11,15 +11,20 @@ from deep_chem.utils.featurize import extract_data
from deep_chem.utils.featurize import generate_targets
from deep_chem.utils.featurize import generate_features
from deep_chem.utils.featurize import generate_vs_utils_features
from deep_chem.utils.featurize import featurize_inputs
from deep_chem.models.standard import fit_singletask_models
from deep_chem.utils.load import process_datasets
from deep_chem.utils.load import transform_data
from deep_chem.utils.evaluate import results_to_csv
from deep_chem.utils.save import save_model
from deep_chem.utils.save import load_model
from deep_chem.utils.save import save_sharded_dataset
from deep_chem.utils.save import load_sharded_dataset
from deep_chem.utils.evaluate import results_to_csv
from deep_chem.utils.evaluate import eval_trained_model
from deep_chem.utils.evaluate import compute_model_performance
from deep_chem.utils.preprocess import train_test_split
from deep_chem.utils.fit import fit_model


def add_featurization_command(subparsers):
  """Adds flags for featurize subcommand."""
@@ -75,7 +80,7 @@ def add_featurize_group(featurize_cmd):
  featurize_group.add_argument(
      "--out", required=1,
      help="Folder to generate processed dataset in.")
  featurize_group.set_defaults(func=featurize_input)
  featurize_group.set_defaults(func=featurize_inputs_wrapper)

def add_train_test_command(subparsers):
  """Adds flags for train-test-split subcommand."""
@@ -118,7 +123,7 @@ def add_train_test_command(subparsers):
  train_test_cmd.add_argument(
      "--test-out", type=str, required=1,
      help="Location to save test set.")
  train_test_cmd.set_defaults(func=train_test_input)
  train_test_cmd.set_defaults(func=train_test_split_wrapper)

def add_model_group(fit_cmd):
  """Adds flags for specifying models."""
@@ -127,13 +132,15 @@ def add_model_group(fit_cmd):
      "--model", required=1,
      choices=["logistic", "rf_classifier", "rf_regressor",
               "linear", "ridge", "lasso", "lasso_lars", "elastic_net",
               "singletask_deep_network", "multitask_deep_network", "3D_cnn"],
               "singletask_deep_classifier", "multitask_deep_classifier",
               "singletask_deep_regressor", "multitask_deep_regressor",
               "convolutional_3D_regressor"],
      help="Type of model to build. Some models may allow for\n"
           "further specification of hyperparameters. See flags below.")

  group = fit_cmd.add_argument_group("Neural Net Parameters")
  group.add_argument(
      "--n-hidden", type=int, default=500,
      "--nb-hidden", type=int, default=500,
      help="Number of hidden neurons for NN models.")
  group.add_argument(
      "--learning-rate", type=float, default=0.01,
@@ -142,7 +149,7 @@ def add_model_group(fit_cmd):
      "--dropout", type=float, default=0.5,
      help="Learning rate for NN models.")
  group.add_argument(
      "--n-epochs", type=int, default=50,
      "--nb-epoch", type=int, default=50,
      help="Number of epochs for NN models.")
  group.add_argument(
      "--batch-size", type=int, default=32,
@@ -163,10 +170,6 @@ def add_fit_command(subparsers):
  fit_cmd = subparsers.add_parser(
      "fit", help="Fit a model to training data.")
  group = fit_cmd.add_argument_group("load-and-transform")
  group.add_argument(
      "--task-type", required=1,
      choices=["classification", "regression"],
      help="Type of learning task.")
  group.add_argument(
      "--saved-data", required=1,
      help="Location of saved transformed data.")
@@ -175,7 +178,7 @@ def add_fit_command(subparsers):
  group.add_argument(
      "--saved-out", type=str, required=1,
      help="Location to save trained model.")
  fit_cmd.set_defaults(func=fit_model)
  fit_cmd.set_defaults(func=fit_model_wrapper)


def add_eval_command(subparsers):
@@ -190,44 +193,16 @@ def add_eval_command(subparsers):
  group.add_argument(
      "--saved-data", required=1, help="Location of saved transformed data.")
  group.add_argument(
      "--modeltype", required=1,
      "--model_type", required=1,
      choices=["sklearn", "keras-graph", "keras-sequential"],
      help="Type of model to load.")
  # TODO(rbharath): This argument seems a bit extraneous. Is it really
  # necessary?
  group.add_argument(
      "--task-type", required=1,
      choices=["classification", "regression"],
      help="Type of learning task.")
  group = eval_cmd.add_argument_group("Classification metrics")
  group.add_argument(
      "--compute-aucs", action="store_true", default=False,
      help="Compute AUC for trained models on test set.")
  group.add_argument(
      "--compute-accuracy", action="store_true", default=False,
      help="Compute accuracy for trained models on test set.")
  group.add_argument(
      "--compute-recall", action="store_true", default=False,
      help="Compute recall for trained models on test set.")
  group.add_argument(
      "--compute-matthews-corrcoef", action="store_true", default=False,
      help="Compute Matthews Correlation Coefficient for trained models on test set.")

  group = eval_cmd.add_argument_group("Regression metrics")
  group.add_argument(
      "--compute-r2s", action="store_true", default=False,
      help="Compute R^2 for trained models on test set.")
  group.add_argument(
      "--compute-rms", action="store_true", default=False,
      help="Compute RMS for trained models on test set.")

  eval_cmd.add_argument(
      "--csv-out", type=str, required=1,
      help="Outputted predictions on evaluated set.")
  eval_cmd.add_argument(
      "--stats-out", type=str, required=1j,
      help="Computed statistics on evaluated set.")
  eval_cmd.set_defaults(func=eval_trained_model)
  eval_cmd.set_defaults(func=eval_trained_model_wrapper)

# TODO(rbharath): There are a lot of duplicate commands introduced here. Is
# there a nice way to factor them?
@@ -272,19 +247,22 @@ def add_model_command(subparsers):
           "specified requires that split be in original data.")

  add_model_group(model_cmd)
  model_cmd.add_argument(
      "--task-type", default="classification",
      choices=["classification", "regression"],
      help="Type of learning task.")
  model_cmd.set_defaults(func=create_model)

def extract_training_params(args):
  params = ["nb_hidden", "learning_rate", "dropout",
            "nb_epoch", "decay", "batch_size", "loss_function"]

  training_params = {param: getattr(args, param) for param in params}
  return training_params
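
For reference, given the defaults on the flags defined above, the flat dict this produces looks roughly like the following; the decay and loss_function defaults are not visible in this preview, so they are left elided:

training_params = {
    "nb_hidden": 500,       # --nb-hidden default
    "learning_rate": 0.01,  # --learning-rate default
    "dropout": 0.5,         # --dropout default
    "nb_epoch": 50,         # --nb-epoch default
    "decay": ...,           # default not shown in this preview
    "batch_size": 32,       # --batch-size default
    "loss_function": ...,   # default not shown in this preview
}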

def create_model(args):
  """Creates a model"""
  data_dir = os.path.join(args.out, args.name)
  print("+++++++++++++++++++++++++++++++++")
  print("Perform featurization")
  if not args.skip_featurization:
    _featurize_input(
    featurize_inputs(
        args.name, args.out, args.input_file, args.input_type, args.fields,
        args.field_types, args.feature_fields, args.target_fields,
        args.smiles_field, args.split_field, args.id_field, args.threshold,
@@ -296,20 +274,20 @@ def create_model(args):
  train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
  test_out = os.path.join(data_dir, "%s-test.joblib" % args.name)
  if not args.skip_train_test_split:
    _train_test_input(
    train_test_split(
        paths, args.output_transforms, args.input_transforms, args.feature_types,
        args.splittype, args.mode, train_out, test_out,
        args.target_fields)

  print("+++++++++++++++++++++++++++++++++")
  print("Fit model")
  modeltype = get_model_type(args.model)
  extension = get_model_extension(modeltype)
  model_type = get_model_type(args.model)
  extension = get_model_extension(model_type)
  saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
  if not args.skip_fit:
    _fit_model(
        args.model, args.task_type, args.n_hidden, args.learning_rate,
        args.dropout, args.n_epochs, args.decay, args.batch_size, args.loss_function,
    training_params = extract_training_params(args)
    fit_model(
        args.model, training_params,
        args.validation_split, saved_out, train_out, args.target_fields)


@@ -320,24 +298,14 @@ def create_model(args):
  stats_out_train = os.path.join(data_dir, "%s-train-stats.txt" % args.name)
  csv_out_test = os.path.join(data_dir, "%s-test.csv" % args.name)
  stats_out_test = os.path.join(data_dir, "%s-test-stats.txt" % args.name)
  compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
      False, False, False, False)
  compute_r2s, compute_rms = False, False
  if args.task_type == "classification":
    compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
        True, True, True, True)
  elif args.task_type == "regression":
    compute_r2s, compute_rms = True, True
  _eval_trained_model(
      modeltype, saved_out, train_out, args.task_type, compute_aucs,
      compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_train, stats_out_train, args.target_fields)
  eval_trained_model(
      model_type, saved_out, train_out, csv_out_train, 
      stats_out_train, args.target_fields)
  print("Eval Model on Test")
  print("------------------")
  _eval_trained_model(
      modeltype, saved_out, test_out, args.task_type, compute_aucs,
      compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_test, stats_out_test, args.target_fields)
  eval_trained_model(
      model_type, saved_out, test_out, csv_out_test, 
      stats_out_test, args.target_fields)

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -353,207 +321,58 @@ def parse_args(input_args=None):

  return parser.parse_args(input_args)

def featurize_input(args):
def featurize_inputs_wrapper(args):
  """Wrapper function that calls _featurize_input with args unwrapped."""
  _featurize_inputs(
  featurize_inputs(
      args.name, args.out, args.input_file, args.input_type, args.fields,
      args.field_types, args.feature_fields, args.target_fields,
      args.smiles_field, args.split_field, args.id_field, args.threshold,
      args.delimiter)


#make this helper and add a wrapper function that has "input files" and add multiprocessing option
#shard into 10x at this step (make a flag)
'''
def _featurize_input(input_file, name, out, input_type, fields, field_types,
                     feature_fields, prediction_field, smiles_field,
'''
def _featurize_input(name, out, input_file, input_type, fields, field_types,
                     feature_fields, target_fields, smiles_field,
                     split_field, id_field, threshold, delimiter):
  """Featurizes raw input data."""
  if len(fields) != len(field_types):
    raise ValueError("number of fields does not equal number of field types")
  if id_field is None:
    id_field = smiles_field
    '''
  out_x_joblib, out_y_joblib = generate_directories(name, input_file, out, feature_fields)
  df, mols = extract_data(
      input_file, input_type, fields, field_types, prediction_field,
      smiles_field, threshold, delimiter)
  print "Generating targets"
  generate_targets(df, prediction_field, split_field,
                   smiles_field, id_field, out_y_joblib)
  print "Generating user-specified features"
  generate_features(df, feature_fields, smiles_field, id_field, out_x_joblib)
  print "Generating circular fingerprints"
  generate_vs_utils_features(df, name, input_file, out, smiles_field, id_field, "fingerprints")
  print "Generating rdkit descriptors"
  generate_vs_utils_features(df, name, input_file, out, smiles_field, id_field, "descriptors")

def _featurize_inputs(name, out, input_files, input_type, fields, field_types,
                     feature_fields, prediction_field, smiles_field,
                     split_field, id_field, threshold, delimiter):
  
  other_arguments = (name, out, input_type, fields, field_types,
                     feature_fields, prediction_field, smiles_field,
                     split_field, id_field, threshold, delimiter)
  pool = mp.Pool(mp.cpu_count())
  pool.map(_featurize_input, itertools.izip(input_files, itertools.repeat(other_arguments)))
  pool.terminate()
  
  '''
  out_x_pkl, out_y_pkl = generate_directories(name, out, feature_fields)
  df, _ = extract_data(
      input_file, input_type, fields, field_types, target_fields,
      smiles_field, threshold, delimiter)
  print("Generating targets")
  generate_targets(df, target_fields, split_field,
                   smiles_field, id_field, out_y_pkl)
  print("Generating user-specified features")
  generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
  print("Generating circular fingerprints")
  generate_vs_utils_features(df, name, out, smiles_field, id_field, "fingerprints")
  print("Generating rdkit descriptors")
  generate_vs_utils_features(df, name, out, smiles_field, id_field, "descriptors")

def train_test_input(args):
  """Wrapper function that calls _train_test_input after unwrapping args."""
  _train_test_input(
def train_test_split_wrapper(args):
  """Wrapper function that calls _train_test_split_wrapper after unwrapping args."""
  preprocess.train_test_split(
      args.paths, args.output_transforms, args.input_transforms,
      args.feature_types, args.splittype, args.mode,
      args.train_out, args.test_out, args.target_fields)

#decompose this into: a) compute train test split using only smiles.  b) for each shard, make a train test numpy array 
def _train_test_input(paths, output_transforms, input_transforms,
                      feature_types, splittype, mode,
                      train_out, test_out, target_names):
  """Saves transformed model."""
  if output_transforms == "" or output_transforms == "None":
    output_transforms = []
  else:
    output_transforms = output_transforms.split(",")
  feature_types = feature_types.split(",")
  print("About to process_dataset")
  train_dict, test_dict = process_datasets(
      paths, feature_types=feature_types, splittype=splittype,
      mode=mode, target_names=target_names)
  print("Finished process_dataset")

  print("Starting transform_data")
  trans_train_dict = transform_data(
      train_dict, input_transforms, output_transforms)
  print("Finished transform_data on train")
  trans_test_dict = transform_data(test_dict, input_transforms, output_transforms)
  print("Finished transform_data on test")
  transforms = {"input_transforms": input_transforms,
                "output_transform": output_transforms}
  stored_train = {"raw": train_dict,
                  "transformed": trans_train_dict,
                  "transforms": transforms}
  stored_test = {"raw": test_dict,
                 "transformed": trans_test_dict,
                 "transforms": transforms}
  print("About to save dataset..")
  save_sharded_dataset(stored_train, train_out)
  save_sharded_dataset(stored_test, test_out)

def _train_test_inputs(paths, output_transforms, input_transforms,
                      feature_types, splittype, weight_positives, mode,
                      train_out, test_out):

def fit_model(args):
def fit_model_wrapper(args):
  """Wrapper that calls _fit_model with arguments unwrapped."""
  # TODO(rbharath): Bundle these arguments up into a training_params dict.
  _fit_model(
      args.model, args.task_type, args.n_hidden,
      args.learning_rate, args.dropout, args.n_epochs, args.decay,
      args.batch_size, args.loss_function, args.validation_split,
  training_params = extract_training_params(args)
  fit_model(
      args.model, training_params, args.validation_split,
      args.saved_out, args.saved_data, args.target_fields)

def _fit_model(model, task_type, n_hidden, learning_rate, dropout,
               n_epochs, decay, batch_size, loss_function, validation_split, saved_out,
               saved_data, target_names):
  """Builds model from featurized data."""
  task_types = {target: task_type for target in target_names}

  stored_train = load_sharded_dataset(saved_data)
  train_dict = stored_train["transformed"]

  if model == "singletask_deep_network":
    from deep_chem.models.deep import fit_singletask_mlp
    models = fit_singletask_mlp(
        train_dict, task_types, n_hidden=n_hidden, learning_rate=learning_rate,
        dropout=dropout, nb_epoch=n_epochs, decay=decay, batch_size=batch_size,
        validation_split=validation_split)
  elif model == "multitask_deep_network":
    from deep_chem.models.deep import fit_multitask_mlp
    models = fit_multitask_mlp(
        train_dict, task_types, n_hidden=n_hidden, learning_rate=learning_rate,
        dropout=dropout, batch_size=batch_size, nb_epoch=n_epochs, decay=decay,
        validation_split=validation_split)
  elif model == "3D_cnn":
    from deep_chem.models.deep3d import fit_3D_convolution
    models = fit_3D_convolution(
        train_dict, nb_epoch=n_epochs, batch_size=batch_size,
        learning_rate=learning_rate, loss_function=loss_function)
  else:
    models = fit_singletask_models(train_dict, model)
  modeltype = get_model_type(model)
  save_model(models, modeltype, saved_out)


def get_model_type(model):
  """Associate each model with a modeltype (used for saving/loading)."""
  """Associate each model with a model_type (used for saving/loading)."""
  if model in ["singletask_deep_network", "multitask_deep_network"]:
    modeltype = "keras-graph"
    model_type = "keras-graph"
  elif model in ["3D_cnn"]:
    modeltype = "keras-sequential"
    model_type = "keras-sequential"
  elif model == "neural_fingerprint":
    modeltype = "autograd"
    model_type = "autograd"
  else:
    modeltype = "sklearn"
  return modeltype
    model_type = "sklearn"
  return model_type

def get_model_extension(modeltype):
def get_model_extension(model_type):
  """Get the saved filetype extension for various types of models."""
  if modeltype == "sklearn":
  if model_type == "sklearn":
    return "joblib"
  elif modeltype == "autograd":
  elif model_type == "autograd":
    return "joblib.gz"
  elif modeltype == "keras-graph" or modeltype == "keras-sequential":
  elif model_type == "keras-graph" or model_type == "keras-sequential":
    return "h5"

def eval_trained_model(args):
def eval_trained_model_wrapper(args):
  """Wrapper function that calls _eval_trained_model with unwrapped args."""
  _eval_trained_model(
      args.modeltype, args.saved_model, args.saved_data,
      args.task_type, args.compute_aucs, args.compute_recall,
      args.compute_accuracy, args.compute_matthews_corrcoef, args.compute_r2s,
      args.compute_rms, args.csv_out, args.stats_out,
      args.target_fields)
  eval_trained_model(
      args.model_type, args.saved_model, args.saved_data,
      args.csv_out, args.stats_out, args.target_fields)


def _eval_trained_model(modeltype, saved_model, saved_data, task_type,
                        compute_aucs, compute_recall, compute_accuracy,
                        compute_matthews_corrcoef, compute_r2s, compute_rms,
                        csv_out, stats_out, target_names):
  """Evaluates a trained model on specified data."""
  model = load_model(modeltype, saved_model)
  task_types = {target: task_type for target in target_names}

  stored_test = load_sharded_dataset(saved_data)
  test_dict = stored_test["transformed"]
  raw_test_dict = stored_test["raw"]
  output_transforms = stored_test["transforms"]["output_transform"]

  with open(stats_out, "wb") as stats_file:
    results, _, _, _ = compute_model_performance(
        raw_test_dict, test_dict, task_types, model, modeltype,
        output_transforms, aucs=compute_aucs, r2s=compute_r2s, rms=compute_rms,
        recall=compute_recall, accuracy=compute_accuracy,
        mcc=compute_matthews_corrcoef, print_file=stats_file)
  with open(stats_out, "r") as stats_file:
    print(stats_file.read())
  results_to_csv(results, csv_out, task_type=task_type)

def main():
  """Invokes argument parser."""
+21 −0

File changed; preview size limit exceeded, changes collapsed.