Commit 67a19613 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Added first version of model command. Need to wire in connections to make it run.

parent b5a1f89e
Loading
Loading
Loading
Loading
+156 −91
Original line number Diff line number Diff line
@@ -19,12 +19,7 @@ from deep_chem.utils.save import save_model
from deep_chem.utils.save import load_model
from deep_chem.utils.evaluate import compute_model_performance

def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  subparsers = parser.add_subparsers(title='Modes')
 
  # FEATURIZE FLAGS
def add_featurization_command(subparsers):
  featurize_cmd = subparsers.add_parser("featurize",
      help="Featurize raw input data.")
  featurize_cmd.add_argument("--input-file", required=1,
@@ -61,7 +56,7 @@ def parse_args(input_args=None):
      help="Folder to generate processed dataset in.")
  featurize_cmd.set_defaults(func=featurize_input)

  # Train/Test Splits flag 
def add_train_test_command(subparsers):
  train_test_cmd = subparsers.add_parser("train-test-split",
      help="Apply standard data transforms to raw features generated by featurize,\n"
           "then split data into train/test and store data as (X,y) matrices.")
@@ -72,15 +67,13 @@ def parse_args(input_args=None):
      choices=["log", "normalize"],
      help="Transforms to apply to output data.")
  train_test_cmd.add_argument("--feature-types", nargs="+", required=1,
                      help="Types of featurizations to use.\n"
                           "Each featurization must correspond to subdirectory in\n"
                           "generated data directory.")
      help="Types of featurizations to use. Each featurization must correspond\n"
           "to subdirectory in generated data directory.")
  train_test_cmd.add_argument("--paths", nargs="+", required=1,
      help="Paths to input datasets.")
  train_test_cmd.add_argument("--splittype", type=str, default="scaffold",
      choices=["scaffold", "random", "specified"],
                       help="Type of train/test data-splitting.\n"
                            "scaffold uses Bemis-Murcko scaffolds.\n"
      help="Type of train/test data-splitting. 'scaffold' uses Bemis-Murcko scaffolds.\n"
           "specified requires that split be in original data.")
  train_test_cmd.add_argument("--weight-positives", type=bool, default=False,
      help="Weight positive examples to have same total weight as negatives.")
@@ -93,20 +86,8 @@ def parse_args(input_args=None):
      help="Location to save test set.")
  train_test_cmd.set_defaults(func=train_test_input)

  # TRAIN FLAGS
  train_cmd = subparsers.add_parser("fit",
                  help="Fit a model to training data.")
  group = train_cmd.add_argument_group("load-and-transform")
  group.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
  group.add_argument("--saved-data", required=1,
                     help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")

  group = train_cmd.add_argument_group("model")
def add_model_arguments(fit_cmd):
  group = fit_cmd.add_argument_group("model")
  group.add_argument("--model", required=1,
      choices=["logistic", "rf_classifier", "rf_regressor",
      "linear", "ridge", "lasso", "lasso_lars", "elastic_net",
@@ -115,7 +96,7 @@ def parse_args(input_args=None):
      help="Type of model to build. Some models may allow for\n"
           "further specification of hyperparameters. See flags below.")

  group = train_cmd.add_argument_group("Neural Net Parameters")
  group = fit_cmd.add_argument_group("Neural Net Parameters")
  group.add_argument("--n-hidden", type=int, default=500,
      help="Number of hidden neurons for NN models.")
  group.add_argument("--learning-rate", type=float, default=0.01,
@@ -131,11 +112,29 @@ def parse_args(input_args=None):
  group.add_argument("--validation-split", type=float, default=0.0,
      help="Percent of training data to use for validation.")

  group = train_cmd.add_argument_group("save")

def add_fit_command(subparsers):
  # TRAIN FLAGS
  fit_cmd = subparsers.add_parser("fit",
      help="Fit a model to training data.")
  group = fit_cmd.add_argument_group("load-and-transform")
  group.add_argument("--task-type", default="classification",
      choices=["classification", "regression"],
      help="Type of learning task.")
  group.add_argument("--saved-data", required=1,
      help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")

  add_model_arguments(fit_cmd)
  group = fit_cmd.add_argument_group("save")
  group.add_argument("--saved-out", type=str, required=1,
      help="Location to save trained model.")
  train_cmd.set_defaults(func=fit_model)
  fit_cmd.set_defaults(func=fit_model)


def add_eval_command(subparsers):
  eval_cmd = subparsers.add_parser("eval",
      help="Evaluate trained model on test data processed by transform.")
  group = eval_cmd.add_argument_group("load model/data")
@@ -174,6 +173,72 @@ def parse_args(input_args=None):
      help="Outputted predictions on the test set.")
  eval_cmd.set_defaults(func=eval_trained_model)

# TODO(rbharath): There are a lot of duplicate commands introduced here. Is
# there a nice way to factor them?
def add_model_command(subparsers):
  """Register the combined "model" subcommand.

  Bundles the flags of featurize, train-test-split, fit, and eval into a
  single command for user convenience. NOTE(review): no
  set_defaults(func=...) handler is attached yet, so parsing "model"
  yields a namespace but does not dispatch to an implementation.
  """
  model_cmd = subparsers.add_parser("model",
      help="Combines featurize, train-test-split, fit, eval into one command\n"
           "for user convenience.")

  # Flags mirrored from the "featurize" subcommand.
  featurize_group = model_cmd.add_argument_group("featurize")
  featurize_group.add_argument("--input-file", required=True,
      help="Input file with data.")
  featurize_group.add_argument("--fields", required=True, nargs="+",
      help="Names of fields. Fields correspond to columns in csv files,\n"
           "and to molecular property names for SDF files.")
  featurize_group.add_argument("--field-types", required=True, nargs="+",
      choices=["string", "float", "list-string", "list-float", "ndarray"],
      help="Type of data in fields.")
  featurize_group.add_argument("--feature-fields", type=str, nargs="+",
      help="Optional endpoint that holds pre-computed feature vector")
  featurize_group.add_argument("--prediction-field", type=str, required=True,
      help="Name of measured endpoint to predict.")
  featurize_group.add_argument("--split-field", type=str, default=None,
      help="Name of endpoint specifying train/test split.")
  featurize_group.add_argument("--smiles-field", type=str, default="smiles",
      help="Name of endpoint specifying SMILES for molecule.")
  featurize_group.add_argument("--id-field", type=str, default=None,
      help="Name of endpoint specifying unique identifier for molecule.\n"
           "If none is specified, then smiles-endpoint is used as identifier.")
  featurize_group.add_argument("--feature-types", nargs="+", required=True,
      help="Types of featurizations to use. Each featurization must correspond\n"
           "to subdirectory in generated data directory.")
  featurize_group.add_argument("--out", required=True,
      help="Folder to generate processed dataset in.")

  # Flags mirrored from the "train-test-split" subcommand.
  train_test_group = model_cmd.add_argument_group("train_test_group")
  train_test_group.add_argument("--input-transforms", nargs="+", default=[],
      choices=["normalize-and-truncate"],
      help="Transforms to apply to input data.")
  train_test_group.add_argument("--output-transforms", nargs="+", default=[],
      choices=["log", "normalize"],
      help="Transforms to apply to output data.")
  train_test_group.add_argument("--mode", default="singletask",
      choices=["singletask", "multitask"],
      help="Type of model being built.")
  train_test_group.add_argument("--splittype", type=str, default="scaffold",
      choices=["scaffold", "random", "specified"],
      help="Type of train/test data-splitting. 'scaffold' uses Bemis-Murcko scaffolds.\n"
           "specified requires that split be in original data.")

  # Model-selection and hyperparameter flags shared with the "fit" subcommand.
  add_model_arguments(model_cmd)
  model_cmd.add_argument("--task-type", default="classification",
      choices=["classification", "regression"],
      help="Type of learning task.")
  model_cmd.add_argument("--csv-out", type=str, default=None,
      help="Outputted predictions on the test set.")

def parse_args(input_args=None):
  """Build the top-level CLI parser and parse command-line arguments.

  Each add_*_command helper registers one subcommand on the shared
  subparsers object; registration order determines the order the modes
  appear in --help output.
  """
  parser = argparse.ArgumentParser()
  modes = parser.add_subparsers(title='Modes')
  registrars = (
      add_featurization_command,
      add_train_test_command,
      add_fit_command,
      add_eval_command,
      add_model_command,
  )
  for register in registrars:
    register(modes)
  return parser.parse_args(input_args)

def featurize_input(args):