Commit f2dca20f authored by evanfeinberg's avatar evanfeinberg
Browse files

made data munging more efficient

parent 63ada38a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -89,7 +89,7 @@ def train_multitask_model(X, y, W, task_types,
  sorted_targets = sorted(task_types.keys())
  local_task_types = task_types.copy()
  endpoints = sorted_targets
  (_, n_inputs) = np.shape(X)
  (_, n_inputs) = np.shape(X[0].flatten())
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
  model = Graph()
+6 −4
Original line number Diff line number Diff line
@@ -12,12 +12,14 @@ def fit_3D_convolution(per_task_data, task_types, **training_params):
  Perform stochastic gradient descent for a 3D CNN.
  """
  models = {}
  (_, X_train, y_train, _), _ = per_task_data["all"]
  (_, X_train, y_train, _) = per_task_data.itervalues().next()
  nb_classes = 2
  models["all"] = train_3D_convolution(X_train, y_train, **training_params)
  return models

def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,
  loss_function="mean_squared_error"):

  """
  Fit a keras 3D CNN to data.

@@ -70,9 +72,9 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1):
  # TODO(rbharath): Generalize this to support classification as well as regression.
  model.add(Dense(32/2, 1, init='normal'))

  sgd = RMSprop(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
  sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
  print "About to compile model"
  model.compile(loss='mean_squared_error', optimizer=sgd)
  model.compile(loss=loss_function, optimizer=sgd)
  print "About to fit data to model."
  model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)
  return model
+25 −14
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ Top level script to featurize input, train models, and evaluate them.
import argparse
import gzip
import cPickle as pickle
import joblib
import os
from deep_chem.utils.featurize import generate_directories
from deep_chem.utils.featurize import extract_data
@@ -17,6 +18,8 @@ from deep_chem.utils.load import transform_data
from deep_chem.utils.evaluate import results_to_csv
from deep_chem.utils.save import save_model
from deep_chem.utils.save import load_model
from deep_chem.utils.save import save_sharded_dataset
from deep_chem.utils.save import load_sharded_dataset
from deep_chem.utils.evaluate import compute_model_performance

def add_featurization_command(subparsers):
@@ -145,6 +148,9 @@ def add_model_group(fit_cmd):
  group.add_argument(
      "--batch-size", type=int, default=32,
      help="Number of examples per minibatch for NN models.")
  group.add_argument(
      "--loss-function", type=str, default="mean_squared_error",
      help="Loss function type.")  
  group.add_argument(
      "--decay", type=float, default=1e-4,
      help="Learning rate decay for NN models.")
@@ -292,8 +298,8 @@ def create_model(args):
  print "Perform train-test split"
  paths = [data_dir]
  weight_positives = False  # Hard coding this for now
  train_out = os.path.join(data_dir, "%s-train.pkl.gz" % args.name)
  test_out = os.path.join(data_dir, "%s-test.pkl.gz" % args.name)
  train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
  test_out = os.path.join(data_dir, "%s-test.joblib" % args.name)
  _train_test_input(
      paths, args.output_transforms, args.input_transforms, args.feature_types,
      args.splittype, weight_positives, args.mode, train_out, test_out)
@@ -305,7 +311,7 @@ def create_model(args):
  saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
  _fit_model(
      paths, args.model, args.task_type, args.n_hidden, args.learning_rate,
      args.dropout, args.n_epochs, args.decay, args.batch_size,
      args.dropout, args.n_epochs, args.decay, args.batch_size, args.loss_function,
      args.validation_split, saved_out, train_out)


@@ -397,13 +403,19 @@ def _train_test_input(paths, output_transforms, input_transforms,
    output_transforms = output_transforms.split(",")
  output_transforms_dict = {target: output_transforms for target in targets}
  feature_types = feature_types.split(",")
  print("About to process_dataset")
  train_dict, test_dict = process_datasets(
      paths, input_transforms, output_transforms_dict,
      feature_types=feature_types, splittype=splittype,
      weight_positives=weight_positives, mode=mode)
  print("Finished process_dataset")

  print("Starting transform_data")
  trans_train_dict = transform_data(
      train_dict, input_transforms, output_transforms)
  print("Finished transform_data on train")
  trans_test_dict = transform_data(test_dict, input_transforms, output_transforms)
  print("Finished transform_data on test")
  transforms = {"input_transforms": input_transforms,
                "output_transform": output_transforms}
  stored_train = {"raw": train_dict,
@@ -412,10 +424,9 @@ def _train_test_input(paths, output_transforms, input_transforms,
  stored_test = {"raw": test_dict,
                 "transformed": trans_test_dict,
                 "transforms": transforms}
  with gzip.open(train_out, "wb") as train_file:
    pickle.dump(stored_train, train_file)
  with gzip.open(test_out, "wb") as test_file:
    pickle.dump(stored_test, test_file)
  print("About to save dataset..")
  save_sharded_dataset(stored_train, train_out)
  save_sharded_dataset(stored_test, test_out)

def fit_model(args):
  """Wrapper that calls _fit_model with arguments unwrapped."""
@@ -423,17 +434,17 @@ def fit_model(args):
  _fit_model(
      args.paths, args.model, args.task_type, args.n_hidden,
      args.learning_rate, args.dropout, args.n_epochs, args.decay,
      args.batch_size, args.validation_split, args.saved_out, args.saved_data)
      args.batch_size, args.loss_function,
      args.validation_split, args.saved_out, args.saved_data)

def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
               n_epochs, decay, batch_size, validation_split, saved_out,
               n_epochs, decay, batch_size, loss_function, validation_split, saved_out,
               saved_data):
  """Builds model from featurized data."""
  targets = get_target_names(paths)
  task_types = {target: task_type for target in targets}

  with gzip.open(saved_data) as data_file:
    stored_train = pickle.load(data_file)
  stored_train = load_sharded_dataset(saved_data)
  train_dict = stored_train["transformed"]

  if model == "singletask_deep_network":
@@ -451,7 +462,8 @@ def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
  elif model == "3D_cnn":
    from deep_chem.models.deep3d import fit_3D_convolution
    models = fit_3D_convolution(
        train_dict, task_types, nb_epoch=n_epochs, batch_size=batch_size)
        train_dict, task_types, nb_epoch=n_epochs, batch_size=batch_size,
        learning_rate=learning_rate,loss_function=loss_function)
  else:
    models = fit_singletask_models(train_dict, model)
  modeltype = get_model_type(model)
@@ -495,8 +507,7 @@ def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
  targets = get_target_names(paths)
  task_types = {target: task_type for target in targets}

  with gzip.open(saved_data) as data_file:
    stored_test = pickle.load(data_file)
  stored_test = load_sharded_dataset(saved_data)
  test_dict = stored_test["transformed"]
  raw_test_dict = stored_test["raw"]
  output_transforms = stored_test["transforms"]["output_transform"]
+2 −1
Original line number Diff line number Diff line
@@ -34,7 +34,8 @@ def compute_model_performance(raw_test_data, test_data, task_types, models,
    print("Target %s" % target, file=print_file)
    (test_ids, X_test, y_test, w_test) = test_data[target]
    (_, _, ytest_raw, _) = raw_test_data[target]
    model = models[target]
    #model = models[target]
    model = models.itervalues().next()
    results = eval_model(
        test_ids, X_test, y_test, ytest_raw, w_test, model,
        {target: task_types[target]}, modeltype=modeltype,
+12 −4
Original line number Diff line number Diff line
@@ -36,16 +36,24 @@ def process_datasets(paths, input_transforms, output_transforms,
  seed: int
    Seed used for random splits.
  """
  print("Loading dataset...")
  dataset = load_datasets(paths, feature_types=feature_types)
  print("Loaded dataset")
  train_dict, test_dict = {}, {}
  if mode == "singletask":
    singletask = multitask_to_singletask(dataset)
    for target in singletask:
      data = singletask[target]
    print("Completed multitask_to_singletask")
    for task in singletask:
      print(task)
      print("About to split dataset")
      data = singletask[task]
      if len(data) == 0:
        continue
      print("About to split train and test")
      train, test = split_dataset(dataset, splittype)
      train_dict[target], test_dict[target] = to_arrays(train, test)
      print("Done spliting train and test")
      train_dict[task], test_dict[task] = to_arrays(train, test)
      print("to_arrays is done")
  elif mode == "multitask":
    train, test = split_dataset(dataset, splittype)
    train_data, test_data = to_arrays(train, test)
@@ -140,7 +148,7 @@ def load_assays(paths, target_dir_name="targets"):
          raise ValueError("Prediction Endpoint Missing.")
        for ind, id in enumerate(contents["mol_id"]):
          measurement = contents["prediction"][ind]
          if "split" is not None:
          if "split" in contents:
            splits[id] = contents["split"][ind]
          else:
            splits[id] = None
Loading