Commit fde59eb8 authored by evanfeinberg's avatar evanfeinberg
Browse files

Fixed 3D convolutional model fitting and evaluation.

parent 719e4bc9
Loading
Loading
Loading
Loading
+25 −22
Original line number Diff line number Diff line
@@ -18,8 +18,11 @@ class DockingDNN(Model):
  Wrapper class for fitting 3D convolutional networks for deep docking.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)
    if initialize_raw_model:
      (axis_length, _, _, n_channels) = model_params["data_shape"]
      self.input_shape = (n_channels, 
                          axis_length, axis_length, axis_length)

      learning_rate = model_params["learning_rate"]
      loss_function = model_params["loss_function"]
@@ -33,21 +36,21 @@ class DockingDNN(Model):
      # level of convolution to perform at each layer (CONV x CONV)
      nb_conv = [7, 5, 3]
      model = Sequential()
      model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,

      model.add(Convolution3D(nb_filter=nb_filters[0], nb_depth=nb_conv[0], 
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
                              nb_depth=nb_conv[0], border_mode='valid'))
                              input_shape=self.input_shape))
      model.add(Activation('relu'))
      model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))
      model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
                              nb_row=nb_conv[1], nb_col=nb_conv[1], nb_depth=nb_conv[1],
                              border_mode='valid'))

      model.add(MaxPooling3D(pool_size=(nb_pool[0], nb_pool[0], nb_pool[0])))
      model.add(Convolution3D(nb_filter=nb_filters[1],  nb_depth=nb_conv[1],
                              nb_row=nb_conv[1], nb_col=nb_conv[1]))
      model.add(Activation('relu'))
      model.add(MaxPooling3D(poolsize=(nb_pool[1], nb_pool[1], nb_pool[1])))
      model.add(Convolution3D(nb_filter=nb_filters[2], stack_size=nb_filters[1],
                              nb_row=nb_conv[2], nb_col=nb_conv[2],
                              nb_depth=nb_conv[2], border_mode='valid'))
      model.add(MaxPooling3D(pool_size=(nb_pool[1], nb_pool[1], nb_pool[1])))
      model.add(Convolution3D(nb_filter=nb_filters[2], nb_depth=nb_conv[2],
                              nb_row=nb_conv[2], nb_col=nb_conv[2]))
      model.add(Activation('relu'))
      model.add(MaxPooling3D(poolsize=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(MaxPooling3D(pool_size=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(Flatten())
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # Eventually figure out a more general rule that works for all axis sizes.
@@ -60,31 +63,31 @@ class DockingDNN(Model):
      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
      print("About to compile model")
      model.compile(loss=loss_function, optimizer=sgd)
      self.model = model
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)
      self.raw_model = model

  def shuffle_data(self, X):
    """Move the channel axis of a batch of 3D grids to the front.

    Args:
      X: ndarray of shape (n_samples, N, N, N, n_channels) — cubic voxel
        grids with the channel axis last. (Assumes all three spatial axes
        share the same length — TODO confirm against the featurizer.)

    Returns:
      ndarray of shape (n_samples, n_channels, N, N, N) holding the same
      voxel values in the channels-first layout the 3D convnet expects.
    """
    # A plain np.reshape here would only reinterpret the flat buffer,
    # scrambling the voxel/channel correspondence; a transpose is required
    # to actually move the channel axis while preserving each voxel's data.
    return np.transpose(X, (0, 4, 1, 2, 3))


  def fit_on_batch(self, X, y, w):
    print("Training 3D model")
    print("Original shape of X: " + str(np.shape(X)))
    print("Shuffling X dimensions to match convnet")
    # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
    X = self.shuffle_data(X)
    print("Final shape of X: " + str(np.shape(X)))

    print("About to fit data to model.")
    batch_size = self.model_params["batch_size"]
    nb_epoch = self.model_params["nb_epoch"]
    y = y.itervalues().next()
    self.raw_model.train_on_batch(X, y, batch_size=batch_size, nb_epoch=nb_epoch)
    self.raw_model.train_on_batch(X, y)
    print("Finished training on batch.")

  def predict_on_batch(self, X):
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
    X = self.shuffle_data(X)
    y_pred = self.raw_model.predict_on_batch(X)
    y_pred = np.squeeze(y_pred)
    return(y_pred)
    return y_pred
+6 −34
Original line number Diff line number Diff line
@@ -164,10 +164,6 @@ def add_eval_command(subparsers):
      help="Location from which to load saved model.")
  group.add_argument(
      "--saved-data", required=1, help="Location of saved transformed data.")
  group.add_argument(
      "--model_type", required=1,
      choices=["sklearn", "keras-graph", "keras-sequential"],
      help="Type of model to load.")
  eval_cmd.add_argument(
      "--csv-out", type=str, required=1,
      help="Outputted predictions on evaluated set.")
@@ -267,7 +263,6 @@ def create_model(args):

  print("+++++++++++++++++++++++++++++++++")
  print("Fit model")
  model_type = get_model_type(args.model)
  if not args.skip_fit:
    model_params = extract_model_params(args)
    fit_model(
@@ -281,15 +276,13 @@ def create_model(args):
  csv_out_test = os.path.join(data_dir, "test.csv")
  stats_out_test = os.path.join(data_dir, "test-stats.txt")
  eval_trained_model(
      model_type, model_dir, data_dir, csv_out_train,

      stats_out_train, args.task_fields, split="train")
      model_name, model_dir, data_dir, csv_out_train,
      stats_out_train, split="train")
  print("Eval Model on Test")
  print("------------------")
  eval_trained_model(
      model_type, model_dir, data_dir, csv_out_test,

      stats_out_test, args.task_fields, split="test")
      model_name, model_dir, data_dir, csv_out_test,
      stats_out_test, split="test")

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -328,32 +321,11 @@ def fit_model_wrapper(args):
  fit_model(
      args.model_name, model_params, args.model_dir, args.data_dir)

def get_model_type(model):
  """Map a model name to its serialization family (used for saving/loading)."""
  special_cases = {
      "singletask_deep_network": "keras-graph",
      "multitask_deep_network": "keras-graph",
      "3D_cnn": "keras-sequential",
      "neural_fingerprint": "autograd",
  }
  # Any model not listed above is persisted through sklearn/joblib.
  return special_cases.get(model, "sklearn")

def get_model_extension(model_type):
  """Return the on-disk file extension for a given model_type family."""
  extensions = {
      "sklearn": "joblib",
      "autograd": "joblib.gz",
      "keras-graph": "h5",
      "keras-sequential": "h5",
  }
  # NOTE(review): like the original if/elif chain, an unrecognized
  # model_type falls through to None rather than raising.
  return extensions.get(model_type)

def eval_trained_model_wrapper(args):
  """Wrapper function that calls _eval_trained_model with unwrapped args."""
  eval_trained_model(
      args.model_type, args.saved_model, args.data_dir,
      args.csv_out, args.stats_out, args.task_fields)
      args.model, args.model_dir, args.data_dir,
      args.csv_out, args.stats_out, split="test")

def main():
  """Invokes argument parser."""
+15 −6
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ import warnings
#from deep_chem.utils.preprocess import undo_transform_outputs
from deep_chem.utils.preprocess import get_metadata_filename
from deep_chem.utils.preprocess import get_sorted_task_names
from deep_chem.utils.preprocess import get_task_type
from deep_chem.utils.save import load_model
from deep_chem.utils.save import load_sharded_dataset
from sklearn.metrics import mean_squared_error
@@ -24,10 +25,11 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

def eval_trained_model(model_type, model_dir, data_dir, task_type,
def eval_trained_model(model_name, model_dir, data_dir,
                       csv_out, stats_out, split="test"):
  """Evaluates a trained model on specified data."""
  model = load_model(model_type, model_dir)
  model = load_model(model_name, model_dir)
  task_type = get_task_type(model_name)
  task_names, pred_y_df = compute_y_pred(model, data_dir, csv_out, split)
  compute_model_performance(pred_y_df, task_names, task_type, stats_out)

@@ -37,26 +39,28 @@ def compute_y_pred(model, data_dir, csv_out, split):
  """
  metadata_filename = get_metadata_filename(data_dir)
  metadata_df = load_sharded_dataset(metadata_filename)
  task_names = get_sorted_task_names(metadata_df)
  task_names = metadata_df.iterrows().next()[1]['task_names']
  pred_task_names = ["%s_pred" % task_name for task_name in task_names]
  w_task_names = ["%s_weight" % task_name for task_name in task_names]
  column_names = ['ids'] + task_names + pred_task_names + w_task_names
  pred_y_df = pd.DataFrame(columns=column_names)

  for row in metadata_df.iterrows():
  for _, row in metadata_df.iterrows():
    if row['split'] == split:
      X = load_sharded_dataset(row['X'])
      y_pred = model.predict_on_batch(X)
      y = load_sharded_dataset(row['y'])
      w = load_sharded_dataset(row['w'])
      ids = load_sharded_dataset(row['ids'])

      y_pred = model.predict_on_batch(X)
      y_pred = np.reshape(y_pred, np.shape(y))

      mini_df = pd.DataFrame(columns=column_names)
      mini_df['ids'] = ids
      mini_df[task_names] = y
      mini_df[pred_task_names] = y_pred
      mini_df[w_task_names] = w
      pred_y_df = pd.concat(pred_y_df, mini_df)
      pred_y_df = pd.concat([pred_y_df, mini_df])

  print("Saving predictions to %s" % csv_out)
  pred_y_df.to_csv(csv_out)
@@ -72,6 +76,8 @@ def compute_model_performance(pred_y_df, task_names, task_type, stats_file):
    colnames = ["task_name", "roc_auc_score", "matthews_corrcoef", "recall_score", "accuracy_score"]
  elif task_type == "regression":
    colnames = ["task_name", "r2_score", "rms_error"]
  else:
    raise ValueError("Unrecognized task type: %s" % task_type)

  performance_df = pd.DataFrame(columns=colnames)

@@ -97,6 +103,9 @@ def compute_model_performance(pred_y_df, task_names, task_type, stats_file):
  performance_df.to_csv(stats_file)
  print("Saved.")
  
  print("Model performance scores:")
  print(performance_df)

#TODO(enf/rhbarath): This might work, this might be broken.
def compute_roc_auc_scores(y, y_pred, w):
  """Transforms the results dict into roc-auc-scores and prints scores.
+1 −1
Original line number Diff line number Diff line
@@ -38,6 +38,6 @@ def fit_model(model_name, model_params, model_dir, data_dir):
    y = load_sharded_dataset(row['y'])
    w = load_sharded_dataset(row['w'])

    model.train_on_batch(X, y, w)
    model.fit_on_batch(X, y, w)

  save_model(model, model_name, model_dir)
+4 −4
Original line number Diff line number Diff line
@@ -16,15 +16,15 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

def get_task_type(model_type):
def get_task_type(model_name):
  """
  Given model type, determine if classifier or regressor.
  """
  if model_type in ["logistic", "rf_classifier", "singletask_deep_classifier",
  if model_name in ["logistic", "rf_classifier", "singletask_deep_classifier",
                    "multitask_deep_classifier"]:
    return "classifier"
    return "classification"
  else:
    return "regressor"
    return "regression"

def get_train_test_files(paths, train_proportion=0.8):
  """
Loading