Commit 11d9efd6 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Singletask now running, but need to handle missing values cogently.

parent 6f4ba45d
Loading
Loading
Loading
Loading
+16 −16
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ def fit_multitask_mlp(train_data, task_types, **training_params):
  Parameters
  ----------
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    dict mapping task names to output type. Each output type must be either
    "classification" or "regression".
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
@@ -37,10 +37,10 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
  Perform stochastic gradient descent optimization for a keras MLP.

  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    dict mapping task names to output type. Each output type must be either
    "classification" or "regression".
  output_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    dict mapping task names to label transform. Each label transform must be either
    None or "log". Only for regression outputs.
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
@@ -48,14 +48,14 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
  models = {}
  train_ids = train_data["mol_ids"]
  X_train = train_data["features"]
  sorted_targets = train_data["sorted_targets"]
  for index, target in enumerate(sorted_targets):
  sorted_tasks = train_data["sorted_tasks"]
  for index, task in enumerate(sorted_tasks):
    print "Training model %d" % index
    print "Target %s" % target
    (y_train, W_train) = train_data[target]
    print "Target %s" % task
    (y_train, W_train) = train_data[task]
    print "%d compounds in Train" % len(train_ids)
    models[target] = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, **training_params)
    models[task] = train_multitask_model(X_train, y_train, W_train,
        {task: task_types[task]}, **training_params)
  return models

def train_multitask_model(X, y, W, task_types,
@@ -75,7 +75,7 @@ def train_multitask_model(X, y, W, task_types,
  W: np.ndarray
    Weight matrix
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    dict mapping task names to output type. Each output type must be either
    "classification" or "regression".
  learning_rate: float
    Learning rate used.
@@ -90,9 +90,9 @@ def train_multitask_model(X, y, W, task_types,
  """
  eps = .001
  num_tasks = len(task_types)
  sorted_targets = sorted(task_types.keys())
  sorted_tasks = sorted(task_types.keys())
  local_task_types = task_types.copy()
  endpoints = sorted_targets
  endpoints = sorted_tasks
  (_, n_inputs) = np.shape(X[0].flatten())
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
@@ -103,8 +103,8 @@ def train_multitask_model(X, y, W, task_types,
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
  for task, target in enumerate(endpoints):
    task_type = local_task_types[target]
  for task, task in enumerate(endpoints):
    task_type = local_task_types[task]
    if task_type == "classification":
      model.add_node(
          Dense(n_hidden, 2, init='uniform', activation="softmax"),
@@ -116,8 +116,8 @@ def train_multitask_model(X, y, W, task_types,
    model.add_output(name="task%d" % task, input="dense_head%d" % task)
  data_dict, loss_dict, sample_weights = {}, {}, {}
  data_dict["input"] = X
  for task, target in enumerate(endpoints):
    task_type = local_task_types[target]
  for task, task in enumerate(endpoints):
    task_type = local_task_types[task]
    taskname = "task%d" % task
    sample_weights[taskname] = W[:, task]
    if task_type == "classification":
+4 −4
Original line number Diff line number Diff line
@@ -13,12 +13,12 @@ def fit_3D_convolution(train_data, task_types, **training_params):
  """
  models = {}
  X_train = train_data["features"]
  if len(train_data["sorted_targets"]) > 1:
  if len(train_data["sorted_tasks"]) > 1:
    raise ValueError("3D Convolutions only supported for singletask.")
  target_name = train_data["sorted_targets"][0]
  (y_train, _) = train_data["sorted_targets"].itervalues().next()
  task_name = train_data["sorted_tasks"][0]
  (y_train, _) = train_data["sorted_tasks"].itervalues().next()
  nb_classes = 2
  models[target_name] = train_3D_convolution(X_train, y_train, **training_params)
  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
  return models

def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,
+9 −6
Original line number Diff line number Diff line
@@ -24,15 +24,18 @@ def fit_singletask_models(train_data, modeltype):
  seed: int (optional)
    Seed to initialize np.random.
  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    dict mapping task names to label transform. Each label transform must be either
    None or "log". Only for regression outputs.
  """
  models = {}
  print "fit_singletask_models()"
  print "train_data.keys()"
  print train_data.keys()
  X_train = train_data["features"]
  sorted_targets = train_data["sorted_targets"]
  for target in sorted_targets:
    print "Building model for target %s" % target
    (y_train, _) = train_data[target]
  sorted_tasks = train_data["sorted_tasks"]
  for task in sorted_tasks:
    print "Building model for task %s" % task
    (y_train, _) = train_data[task]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
@@ -54,7 +57,7 @@ def fit_singletask_models(train_data, modeltype):
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(X_train, y_train.ravel())
    models[target] = model
    models[task] = model
  return models

## TODO(rbharath): I believe this is broken. Update it to work with the rest of
+27 −13
Original line number Diff line number Diff line
@@ -165,7 +165,7 @@ def add_fit_command(subparsers):
      "fit", help="Fit a model to training data.")
  group = fit_cmd.add_argument_group("load-and-transform")
  group.add_argument(
      "--task-type", default="classification",
      "--task-type", required=1,
      choices=["classification", "regression"],
      help="Type of learning task.")
  group.add_argument(
@@ -206,7 +206,7 @@ def add_eval_command(subparsers):
  # TODO(rbharath): This argument seems a bit extraneous. Is it really
  # necessary?
  group.add_argument(
      "--task-type", default="classification",
      "--task-type", required=1,
      choices=["classification", "regression"],
      help="Type of learning task.")
  group = eval_cmd.add_argument_group("Classification metrics")
@@ -249,6 +249,12 @@ def add_model_command(subparsers):
  model_cmd.add_argument(
      "--skip-featurization", action="store_true",
      help="If set, skip the featurization step.")
  model_cmd.add_argument(
      "--skip-train-test-split", action="store_true",
      help="If set, skip the train-test-split step.")
  model_cmd.add_argument(
      "--skip-fit", action="store_true",
      help="If set, skip model fit step.")
  add_featurize_group(model_cmd)

  train_test_group = model_cmd.add_argument_group("train_test_group")
@@ -300,6 +306,7 @@ def create_model(args):
  weight_positives = False  # Hard coding this for now
  train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
  test_out = os.path.join(data_dir, "%s-test.joblib" % args.name)
  if not args.skip_train_test_split:
    _train_test_input(
        paths, args.output_transforms, args.input_transforms, args.feature_types,
        args.splittype, weight_positives, args.mode, train_out, test_out,
@@ -310,6 +317,7 @@ def create_model(args):
  modeltype = get_model_type(args.model)
  extension = get_model_extension(modeltype)
  saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
  if not args.skip_fit:
    _fit_model(
        paths, args.model, args.task_type, args.n_hidden, args.learning_rate,
        args.dropout, args.n_epochs, args.decay, args.batch_size, args.loss_function,
@@ -326,6 +334,9 @@ def create_model(args):
  compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
      False, False, False, False)
  compute_r2s, compute_rms = False, False
  print "create_model()"
  print "args.task_type"
  print args.task_type
  if args.task_type == "classification":
    compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
        True, True, True, True)
@@ -335,6 +346,8 @@ def create_model(args):
      modeltype, saved_out, train_out, paths, args.task_type, compute_aucs,
      compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_train, stats_out_train, args.target_fields)
  print "(compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)"
  print (compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)
  print "Eval Model on Test"
  print "------------------"
  _eval_trained_model(
@@ -517,8 +530,9 @@ def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
  with open(stats_out, "wb") as stats_file:
    results, _, _, _ = compute_model_performance(
        raw_test_dict, test_dict, task_types, model, modeltype,
        output_transforms, compute_aucs, compute_r2s, compute_rms, compute_recall,
        compute_accuracy, compute_matthews_corrcoef, print_file=stats_file)
        output_transforms, aucs=compute_aucs, r2s=compute_r2s, rms=compute_rms,
        recall=compute_recall, accuracy=compute_accuracy,
        mcc=compute_matthews_corrcoef, print_file=stats_file)
  with open(stats_out, "r") as stats_file:
    print stats_file.read()
  results_to_csv(results, csv_out, task_type=task_type)
+36 −36
Original line number Diff line number Diff line
@@ -33,42 +33,42 @@ def summarize_distribution(y):
  print "Histogram: "
  print hist

def analyze_data(dataset, splittype="random"):
  """Prints summary statistics of the log-transformed training labels.

  The dataset is converted with multitask_to_singletask and every non-empty
  entry of the result is split into train/test. For each entry the mean,
  standard deviation, minimum, maximum and histogram of the natural log of the
  training labels are printed to stdout. Nothing is returned.

  Parameters
  ----------
  dataset: dict
    A dictionary of type produced by load_datasets.
  splittype: string
    Type of split for train/test. Either "random" or "scaffold".

  Raises
  ------
  ValueError
    If splittype is neither "random" nor "scaffold". The check runs inside the
    loop, so it is only reached once a non-empty entry is encountered.
  """
  # Presumably a dict keyed by target name -- confirm against
  # multitask_to_singletask.
  singletask = multitask_to_singletask(dataset)
  for target in singletask:
    data = singletask[target]
    # Skip entries that hold no data.
    if len(data.keys()) == 0:
      continue
    if splittype == "random":
      # Fixed seed; presumably makes the split reproducible -- confirm in
      # train_test_random_split.
      train, test = train_test_random_split(data, seed=0)
    elif splittype == "scaffold":
      train, test = train_test_scaffold_split(data)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    # Only the training labels are used below; test and Xtrain are unused.
    _, Xtrain, ytrain, _ = dataset_to_numpy(train)
    # TODO(rbharath): Take this out once debugging is completed
    # All statistics below are therefore computed on log(labels), not on the
    # raw labels.
    ytrain = np.log(ytrain)
    mean = np.mean(ytrain)
    std = np.std(ytrain)
    minval = np.amin(ytrain)
    maxval = np.amax(ytrain)
    # np.histogram returns a (counts, bin_edges) tuple; default binning is used.
    hist = np.histogram(ytrain)
    print target
    print "Mean: %f" % mean
    print "Std: %f" % std
    print "Min: %f" % minval
    print "Max: %f" % maxval
    print "Histogram: "
    print hist
#def analyze_data(dataset, splittype="random"):
#  """Analyzes regression dataset.
#
#  Parameters
#  ----------
#  dataset: dict
#    A dictionary of type produced by load_datasets.
#  splittype: string
#    Type of split for train/test. Either random or scaffold.
#  """
#  singletask = multitask_to_singletask(dataset)
#  for target in singletask:
#    data = singletask[target]
#    if len(data.keys()) == 0:
#      continue
#    if splittype == "random":
#      train, test = train_test_random_split(data, seed=0)
#    elif splittype == "scaffold":
#      train, test = train_test_scaffold_split(data)
#    else:
#      raise ValueError("Improper splittype. Must be random/scaffold.")
#    _, Xtrain, ytrain, _ = dataset_to_numpy(train)
#    # TODO(rbharath): Take this out once debugging is completed
#    ytrain = np.log(ytrain)
#    mean = np.mean(ytrain)
#    std = np.std(ytrain)
#    minval = np.amin(ytrain)
#    maxval = np.amax(ytrain)
#    hist = np.histogram(ytrain)
#    print target
#    print "Mean: %f" % mean
#    print "Std: %f" % std
#    print "Min: %f" % minval
#    print "Max: %f" % maxval
#    print "Histogram: "
#    print hist


def compare_all_datasets():
Loading