Commit 241c90a3 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Save train and test data to separate files, so that a trained model can be evaluated on the train set and on the test set independently.

parent 5d54bc74
Loading
Loading
Loading
Loading
+5 −7
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@ from keras.optimizers import SGD
from deep_chem.utils.preprocess import to_one_hot


def fit_multitask_mlp(per_task_data, task_types, **training_params):
def fit_multitask_mlp(train_data, task_types, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns AUCs, R^2 scores, and RMS values.
@@ -26,13 +26,12 @@ def fit_multitask_mlp(per_task_data, task_types, **training_params):
  models = {}
  # Follows convention from process_datasets that the data for multitask models
  # is grouped under key "all"
  (_, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
      per_task_data["all"])
  (_, X_train, y_train, W_train) = train_data["all"]
  models["all"] = train_multitask_model(X_train, y_train, W_train, task_types,
                                **training_params)
  return models

def fit_singletask_mlp(per_task_data, task_types, **training_params):
def fit_singletask_mlp(train_data, task_types, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras MLP.

@@ -46,11 +45,10 @@ def fit_singletask_mlp(per_task_data, task_types, **training_params):
    Aggregates keyword parameters to pass to train_multitask_model
  """
  models = {}
  for index, target in enumerate(sorted(per_task_data.keys())):
  for index, target in enumerate(sorted(train_data.keys())):
    print "Training model %d" % index
    print "Target %s" % target
    (train_ids, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        per_task_data[target])
    (train_ids, X_train, y_train, W_train) = train_data[target]
    print "%d compounds in Train" % len(train_ids)
    print "%d compounds in Test" % len(test)
    models[target] = train_multitask_model(X_train, y_train, W_train,
+3 −4
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.svm import SVR

def fit_singletask_models(per_task_data, modeltype, task_types):
def fit_singletask_models(train_data, modeltype, task_types):
  """Fits singletask linear regression models to potency.

  Parameters
@@ -34,10 +34,9 @@ def fit_singletask_models(per_task_data, modeltype, task_types):
    None or "log". Only for regression outputs.
  """
  models = {}
  for index, target in enumerate(sorted(per_task_data.keys())):
  for index, target in enumerate(sorted(train_data.keys())):
    print "Building model %d" % index
    (_, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        per_task_data[target])
    (_, X_train, y_train, W_train) = train_data[target]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
+19 −15
Original line number Diff line number Diff line
@@ -10,9 +10,6 @@ from deep_chem.utils.featurize import extract_data
from deep_chem.utils.featurize import generate_targets
from deep_chem.utils.featurize import generate_features
from deep_chem.utils.featurize import generate_vs_utils_features
from deep_chem.models.deep import fit_singletask_mlp
from deep_chem.models.deep import fit_multitask_mlp
from deep_chem.models.deep3d import fit_3D_convolution
from deep_chem.models.standard import fit_singletask_models
from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import process_datasets
@@ -63,7 +60,7 @@ def parse_args(input_args=None):
  featurize_cmd.set_defaults(func=featurize_input)

  # TRANSFORM FLAGS
  train_test_cmd = subparsers.add_parser("train-test",
  train_test_cmd = subparsers.add_parser("train-test-split",
                      help="Apply standard data transforms to raw features generated by featurize,\n"
                           "then split data into train/test and store data as (X,y) matrices.")
  train_test_cmd.add_argument("--input-transforms", nargs="+", default=[],
@@ -88,8 +85,10 @@ def parse_args(input_args=None):
  train_test_cmd.add_argument("--mode", default="singletask",
                      choices=["singletask", "multitask"],
                      help="Type of model being built.")
  train_test_cmd.add_argument("--out", type=str, required=1,
                     help="Location to save transformed mode.")
  train_test_cmd.add_argument("--train-out", type=str, required=1,
                     help="Location to save train set.")
  train_test_cmd.add_argument("--test-out", type=str, required=1,
                     help="Location to save test set.")
  train_test_cmd.set_defaults(func=train_test_input)

  # TRAIN FLAGS
@@ -192,12 +191,14 @@ def train_test_input(args):
  """Transforms and splits the data, then saves the train and test sets separately."""
  targets = get_target_names(args.paths)
  output_transforms = {target: args.output_transforms for target in targets}
  per_task_data = process_datasets(args.paths,
  train_dict, test_dict= process_datasets(args.paths,
      args.input_transforms, output_transforms, feature_types=args.feature_types, 
      splittype=args.splittype, weight_positives=args.weight_positives,
      mode=args.mode)
  with gzip.open(args.out, "wb") as f:
    pickle.dump(per_task_data, f)
  with gzip.open(args.train_out, "wb") as f:
    pickle.dump(train_dict, f)
  with gzip.open(args.test_out, "wb") as f:
    pickle.dump(test_dict, f)

def fit_model(args):
  """Builds model from featurized data."""
@@ -205,24 +206,27 @@ def fit_model(args):
  task_types = {target: args.task_type for target in targets}

  with gzip.open(args.saved_data) as f:
    per_task_data = pickle.load(f)
    train_dict = pickle.load(f)

  if args.model == "singletask_deep_network":
    models = fit_singletask_mlp(per_task_data, task_types, n_hidden=args.n_hidden,
    from deep_chem.models.deep import fit_singletask_mlp
    models = fit_singletask_mlp(train_dict, task_types, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      nb_epoch=args.n_epochs, decay=args.decay, batch_size=args.batch_size,
      validation_split=args.validation_split)
  elif args.model == "multitask_deep_network":
    models = fit_multitask_mlp(per_task_data, task_types,
    from deep_chem.models.deep import fit_multitask_mlp
    models = fit_multitask_mlp(train_dict, task_types,
      n_hidden=args.n_hidden, learning_rate = args.learning_rate,
      dropout = args.dropout, batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split)
  elif args.model == "3D_cnn":
    from deep_chem.models.deep3d import fit_3D_convolution
    models = fit_3D_convolution(train_data, test_data, task_types,
        nb_epoch=args.n_epochs, batch_size=args.batch_size)
  else:
    models = fit_singletask_models(per_task_data, args.model, task_types)
    models = fit_singletask_models(train_dict, args.model, task_types)
  if args.model in ["singletask_deep_network", "multitask_deep_network", "3D_cnn"]:
    modeltype = "keras"
  else:
@@ -235,9 +239,9 @@ def eval_trained_model(args):
  task_types = {target: args.task_type for target in targets}

  with gzip.open(args.saved_data) as f:
    per_task_data = pickle.load(f)
    test_dict = pickle.load(f)

  results, aucs, r2s, rms = compute_model_performance(per_task_data, task_types, model, args.modeltype,
  results, aucs, r2s, rms = compute_model_performance(test_dict, task_types, model, args.modeltype,
    args.compute_aucs, args.compute_r2s, args.compute_rms) 
  if args.csv_out is not None:
    results_to_csv(results, args.csv_out, task_type=args.task_type)
+3 −3
Original line number Diff line number Diff line
@@ -16,14 +16,14 @@ from sklearn.metrics import r2_score
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

def compute_model_performance(per_task_data, task_types, models, modeltype,
def compute_model_performance(test_data, task_types, models, modeltype,
    aucs=True, r2s=False, rms=False):
  """Computes statistics for model performance on test set."""
  all_results, auc_vals, r2_vals, rms_vals = {}, {}, {}, {}
  for index, target in enumerate(sorted(per_task_data.keys())):
  for index, target in enumerate(sorted(test_data.keys())):
    print "Evaluating model %d" % index
    print "Target %s" % target
    (train_ids, Xtrain, ytrain, wtrain), (test_ids, Xtest, ytest, wtest) = per_task_data[target]
    (test_ids, Xtest, ytest, wtest) = test_data[target]
    model = models[target]
    results = eval_model(test_ids, Xtest, ytest, wtest, model, {target: task_types[target]}, 
                         modeltype=modeltype)
+6 −7
Original line number Diff line number Diff line
@@ -38,7 +38,7 @@ def process_datasets(paths, input_transforms, output_transforms,
  """
  dataset = load_and_transform_dataset(paths, input_transforms, output_transforms,
      feature_types=feature_types, weight_positives=weight_positives)
  arrays = {}
  train_dict, test_dict = {}, {}
  if mode == "singletask":
    singletask = multitask_to_singletask(dataset)
    for target in singletask:
@@ -46,17 +46,16 @@ def process_datasets(paths, input_transforms, output_transforms,
      if len(data) == 0:
        continue
      train, test = split_dataset(dataset, splittype)
      train_data, test_data = to_arrays(train, test)
      arrays[target] = (train_data, test_data)
      train_dict[target], test_dict[target] = to_arrays(train, test)
  elif mode == "multitask":
    train, test = split_dataset(dataset, splittype)
    train_data, test_data = to_arrays(train, test)
    arrays["all"] = (train_data, test_data)
    train_dict["all"], test_dict["all"] = train_data, test_data
  else:
    raise ValueError("Unsupported mode for process_datasets.")
  print "Shape of Xtest"
  print np.shape(arrays['CANVAS-BACE'][0][1])
  return arrays
  print "Shape of Xtrain"
  print np.shape(train_dict['CANVAS-BACE'][1])
  return train_dict, test_dict 


def load_molecules(paths, feature_types=["fingerprints"]):