Commit c422e15f authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

First stab at an added transform command.

parent c2c3cd44
Loading
Loading
Loading
Loading
+46 −55
Original line number Diff line number Diff line
@@ -61,30 +61,42 @@ def parse_args(input_args=None):
                      help="Folder to generate processed dataset in.")
  featurize_cmd.set_defaults(func=featurize_input)

  # TRAIN FLAGS
  train_cmd = subparsers.add_parser("train",
                  help="Train a model on specified data.")
  group = train_cmd.add_argument_group("load-and-transform")
  group.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
  group.add_argument("--input-transforms", nargs="+", default=[],
  # TRANSFORM FLAGS
  transform_cmd = subparsers.add_parser("transform",
                      help="Apply standard data transforms to raw features generated by featurize,\n"
                           "then split data into train/test and store data as (X,y) matrices.")
  transform_cmd.add_argument("--input-transforms", nargs="+", default=[],
                      choices=["normalize-and-truncate"],
                      help="Transforms to apply to input data.")
  group.add_argument("--output-transforms", nargs="+", default=[],
  transform_cmd.add_argument("--output-transforms", nargs="+", default=[],
                      choices=["log", "normalize"],
                      help="Transforms to apply to output data.")
  group.add_argument("--feature-types", nargs="+", required=1,
  transform_cmd.add_argument("--feature-types", nargs="+", required=1,
                      help="Types of featurizations to use.\n"
                           "Each featurization must correspond to subdirectory in\n"
                           "generated data directory.")
  group.add_argument("--paths", nargs="+", required=1,
  transform_cmd.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")
  group.add_argument("--splittype", type=str, default="scaffold",
  transform_cmd.add_argument("--splittype", type=str, default="scaffold",
                       choices=["scaffold", "random", "specified"],
                       help="Type of train/test data-splitting.\n"
                            "scaffold uses Bemis-Murcko scaffolds.\n"
                            "specified requires that split be in original data.")
  transform_cmd.add_argument("--weight-positives", type=bool, default=False,
                  help="Weight positive examples to have same total weight as negatives.")
  transform_cmd.add_argument("--out", type=str, required=1,
                     help="Location to save transformed mode.")
  transform_cmd.set_defaults(func=transform_input)

  # TRAIN FLAGS
  train_cmd = subparsers.add_parser("train",
                  help="Train a model on specified data.")
  group = train_cmd.add_argument_group("load-and-transform")
  group.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
  group.add_argument("--in", required=1,
                     help="Location of saved transformed data.")

  group = train_cmd.add_argument_group("model")
  group.add_argument("--mode", default="singletask",
@@ -113,8 +125,6 @@ def parse_args(input_args=None):
                  help="Learning rate decay for NN models.")
  group.add_argument("--validation-split", type=float, default=0.0,
                  help="Percent of training data to use for validation.")
  group.add_argument("--weight-positives", type=bool, default=False,
                  help="Weight positive examples to have same total weight as negatives.")

  group = train_cmd.add_argument_group("save")
  group.add_argument("--saved-out", type=str, required=1,
@@ -124,8 +134,10 @@ def parse_args(input_args=None):
  eval_cmd = subparsers.add_parser("eval",
                help="Evaluate trained model on specified data.")
  group = eval_cmd.add_argument_group("load model/data")
  group.add_argument("--saved-in", type=str, required=1,
  group.add_argument("--saved-model", type=str, required=1,
                  help="Location from which to load saved model.")
  group.add_argument("--in", required=1,
                     help="Location of saved transformed data.")
  group.add_argument("--modeltype", required=1,
                      choices=["sklearn", "keras"],
                      help="Type of model to load.")
@@ -133,40 +145,11 @@ def parse_args(input_args=None):
  group.add_argument("--mode", default="singletask",
                      choices=["singletask", "multitask"],
                      help="Type of model being built.")

  # TODO(rbharath): EXTREMELY AWKWARD!!! Both the train and eval commands have
  # to specify the set of input/output transforms desired. This seems like a
  # major API smell with a lot of potential for bugs. I think the right step
  # here is to add a new global sub-command "transform" which performs
  # data-transforms upon the input data to generate train/test splits.
  group = eval_cmd.add_argument_group("load-and-transform")
  # TODO(rbharath): This argument seems a bit extraneous. Is it really
  # necessary?
  group.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
  group.add_argument("--input-transforms", nargs="+", default=[],
                      choices=["normalize-and-truncate"],
                      help="Transforms to apply to input data.")
  group.add_argument("--output-transforms", nargs="+", default=[],
                      choices=["log", "normalize"],
                      help="Transforms to apply to output data.")
  group.add_argument("--feature-types", nargs="+", required=1,
                      help="Types of featurizations to use.\n"
                           "Each featurization must correspond to subdirectory in\n"
                           "generated data directory.")
  group.add_argument("--paths", nargs="+", required=1,
                      help="Paths to evaluation datasets.")
  # TODO(rbharath): There is something awkward here in that we shouldn't have
  # to specify a split to obtain the test-set, right? But I'm not sure what the
  # better method is here, since often the test-set isn't actually stratified
  # out. When we are doing featurization, should we actually do a hard split
  # and write train/test to separate locations? That might actually be the
  # more elegant path.
  group.add_argument("--splittype", type=str, default="scaffold",
                       choices=["scaffold", "random", "specified"],
                       help="Type of train/test data-splitting.\n"
                            "scaffold uses Bemis-Murcko scaffolds.\n"
                            "specified requires that split be in original data.\n"
                            "Evaluation performed upon this split of specified data.")
  group = eval_cmd.add_argument_group("metrics")
  group.add_argument("--compute-aucs", action="store_true", default=False,
                      help="Compute AUC for trained models on test set.")
@@ -199,16 +182,24 @@ def featurize_input(args):
      args.id_endpoint, "descriptors")
  generate_descriptors(df, args.name, args.out, args.smiles_endpoint, args.id_endpoint)

def transform_input(args):
  """Transforms featurized data and saves the result as a gzipped pickle.

  Builds a per-target output-transform mapping, runs process_datasets over
  args.paths, and pickles whatever it returns to args.out.

  Args:
    args: Parsed arguments of the "transform" sub-command. Reads paths,
      input_transforms, output_transforms, feature_types, splittype,
      weight_positives and out, plus mode when present.
  """
  # process_datasets is handed one transform list per target, mirroring how
  # train_model builds this mapping.
  targets = get_target_names(args.paths)
  output_transforms = {target: args.output_transforms for target in targets}
  # The "transform" sub-command defines no --mode flag, so fall back to
  # "singletask", the default of --mode on the train/eval commands.
  mode = getattr(args, "mode", "singletask")
  per_task_data = process_datasets(args.paths,
      args.input_transforms, output_transforms,
      feature_types=args.feature_types,
      splittype=args.splittype, weight_positives=args.weight_positives,
      mode=mode)
  # gzip.open defaults to read mode; open for binary writing explicitly.
  with gzip.open(args.out, "wb") as f:
    pickle.dump(per_task_data, f)

def train_model(args):
  """Builds model from featurized data."""
  targets = get_target_names(args.paths)
  task_types = {target: args.task_type for target in targets}
  output_transforms = {target: args.output_transforms for target in targets}

  per_task_data = process_datasets(args.paths,
      args.input_transforms, output_transforms, feature_types=args.feature_types, 
      splittype=args.splittype, weight_positives=args.weight_positives,
      mode=args.mode)
  with gzip.open(args.in) as f:
    per_task_data = pickle.load(f)

  if args.model == "singletask_deep_network":
    models = fit_singletask_mlp(per_task_data, task_types, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
@@ -232,14 +223,14 @@ def train_model(args):
  save_model(models, modeltype, args.saved_out)

def eval_trained_model(args):
  model = load_model(args.modeltype, args.saved_in)
  model = load_model(args.modeltype, args.saved_model)
  targets = get_target_names(args.paths)
  task_types = {target: args.task_type for target in targets}

  with gzip.open(args.in) as f:
    per_task_data = pickle.load(f)

  output_transforms = {target: args.output_transforms for target in targets}
  per_task_data = process_datasets(args.paths,
      args.input_transforms, output_transforms, feature_types=args.feature_types, 
      splittype=args.splittype, weight_positives=False,
      mode=args.mode)
  results, aucs, r2s, rms = compute_model_performance(per_task_data, task_types, model, args.modeltype,
    args.compute_aucs, args.compute_r2s, args.compute_rms) 
  if args.csv_out is not None: