Commit 6daf0a73 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Added singletask/multitask support.

parent 0917ec2e
Loading
Loading
Loading
Loading
+2 −8
Original line number Diff line number Diff line
"""
Code for processing the Google vs-datasets using scikit-learn.
Code for processing datasets using scikit-learn.
"""
import numpy as np
from deep_chem.utils.load import load_and_transform_dataset
@@ -30,7 +30,7 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
  Parameters
  ----------
  paths: list 
    List of paths to Google vs datasets. 
    List of paths to datasets. 
  modeltype: String
    A string describing the model to be trained. Options are RandomForest,
  splittype: string
@@ -79,12 +79,6 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
      model = LassoLarsCV(max_iter=2000, n_jobs=-1) 
    elif modeltype == "elastic_net":
      model = ElasticNetCV(max_iter=2000, n_jobs=-1) 
    elif modeltype == "svr_rbf":
      model = SVR(kernel="rbf") 
    elif modeltype == "svr_poly":
      model = SVR(kernel="poly") 
    elif modeltype == "svr_linear":
      model = SVR(kernel="linear") 
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(X_train, y_train.ravel())
+48 −39
Original line number Diff line number Diff line
@@ -27,54 +27,63 @@ from deep_chem.utils.preprocess import get_default_descriptor_transforms
def parse_args(input_args=None):
  """Parse command-line arguments.

  Parameters
  ----------
  input_args: list of str or None
    Argument strings to parse. When None, argparse falls back to sys.argv.

  Returns
  -------
  argparse.Namespace holding the parsed options.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--assay', required=True,
                      help='Assay ID.')
  # nargs="+" so several datasets can be combined in one multitask run.
  parser.add_argument('--dataset', required=True, nargs="+",
                      choices=['muv', 'pcba', 'dude', 'pfizer'],
                      help='Name of dataset to process.')
  parser.add_argument('--model', required=True,
                      choices=["logistic", "rf_classifier", "rf_regressor",
                      "linear", "ridge", "lasso", "lasso_lars", "elastic_net",
                      "singletask_deep_network", "multitask_deep_network"])
  parser.add_argument("--splittype", type=str, default="scaffold",
                       choices=["scaffold", "random"],
                       help="Type of cross-validation data-splitting.")
  parser.add_argument("--n-hidden", type=int, default=500,
                      help="Number of hidden neurons for NN models.")
  parser.add_argument("--learning-rate", type=float, default=0.01,
                      help="Learning rate for NN models.")
  parser.add_argument("--dropout", type=float, default=0.5,
                      help="Dropout fraction for NN models.")
  parser.add_argument("--n-epochs", type=int, default=50,
                      help="Number of epochs for NN models.")
  parser.add_argument("--batchsize", type=int, default=32,
                      help="Number of examples per minibatch for NN models.")
  parser.add_argument("--decay", type=float, default=1e-4,
                      help="Learning rate decay for NN models.")
  parser.add_argument("--validation-split", type=float, default=0.0,
                      help="Percent of training data to use for validation.")
  return parser.parse_args(input_args)

def main():
  """Entry point: fit the requested model on the requested dataset(s)."""
  args = parse_args()
  # Location of each supported dataset on disk.
  # TODO(rbharath): The pfizer dataset is private. Remove this before the
  # public release of the code.
  dataset_paths = {
      "muv": "/home/rbharath/vs-datasets/muv",
      "pcba": "/home/rbharath/vs-datasets/pcba",
      "dude": "/home/rbharath/vs-datasets/dude",
      "pfizer": "/home/rbharath/private-datasets/pfizer",
  }
  # Build paths for EVERY requested dataset. (A plain if/elif chain here
  # would silently drop all but the first dataset when --dataset is given
  # multiple values.)
  paths = {name: dataset_paths[name] for name in args.dataset}

  task_types, task_transforms = get_default_task_types_and_transforms(paths)
  desc_transforms = get_default_descriptor_transforms()

  if args.model == "singletask_deep_network":
    fit_singletask_mlp(paths.values(), task_types, task_transforms,
      desc_transforms, splittype=args.splittype, add_descriptors=False,
      n_hidden=args.n_hidden, learning_rate=args.learning_rate,
      dropout=args.dropout, nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split)
  elif args.model == "multitask_deep_network":
    fit_multitask_mlp(paths.values(), task_types, task_transforms,
      desc_transforms, splittype=args.splittype, add_descriptors=False,
      n_hidden=args.n_hidden, learning_rate=args.learning_rate,
      dropout=args.dropout, nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split)
  else:
    # Scikit-learn style models; honor the --splittype flag (its default is
    # "scaffold", matching the previous hard-coded value).
    fit_singletask_models(paths.values(), args.model, task_types,
        task_transforms, splittype=args.splittype)


if __name__ == "__main__":
  main()