Commit b13e058a authored by Bharath Ramsundar
Browse files

Singletask sklearn/keras models training.

parent 11d9efd6
Loading
Loading
Loading
Loading
+24 −15
Original line number Diff line number Diff line
@@ -53,8 +53,11 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
    print "Training model %d" % index
    print "Target %s" % task
    (y_train, W_train) = train_data[task]
    flat_W_train = W_train.ravel()
    task_X_train = X_train[flat_W_train.nonzero()]
    task_y_train = y_train[flat_W_train.nonzero()]
    print "%d compounds in Train" % len(train_ids)
    models[task] = train_multitask_model(X_train, y_train, W_train,
    models[task] = train_multitask_model(task_X_train, task_y_train, W_train,
        {task: task_types[task]}, **training_params)
  return models

@@ -93,39 +96,45 @@ def train_multitask_model(X, y, W, task_types,
  sorted_tasks = sorted(task_types.keys())
  local_task_types = task_types.copy()
  endpoints = sorted_tasks
  (_, n_inputs) = np.shape(X[0].flatten())
  print "train_multitask_model()"
  print "np.shape(X)"
  print np.shape(X)
  n_inputs = len(X[0].flatten())
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
  print "np.shape(W)"
  print np.shape(W)
  model = Graph()
  model.add_input(name="input", ndim=n_inputs)
  #model.add_input(name="input", ndim=n_inputs)
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_node(
      Dense(n_inputs, n_hidden, init='uniform', activation=activation),
      Dense(n_hidden, init='uniform', activation=activation),
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
  for task, task in enumerate(endpoints):
  for ind, task in enumerate(endpoints):
    task_type = local_task_types[task]
    if task_type == "classification":
      model.add_node(
          Dense(n_hidden, 2, init='uniform', activation="softmax"),
          name="dense_head%d" % task, input=top_layer)
          Dense(2, init='uniform', activation="softmax"),
          name="dense_head%d" % ind, input=top_layer)
    elif task_type == "regression":
      model.add_node(
          Dense(n_hidden, 1, init='uniform'),
          name="dense_head%d" % task, input=top_layer)
    model.add_output(name="task%d" % task, input="dense_head%d" % task)
          Dense(1, init='uniform'),
          name="dense_head%d" % ind, input=top_layer)
    model.add_output(name="task%d" % ind, input="dense_head%d" % ind)
  data_dict, loss_dict, sample_weights = {}, {}, {}
  data_dict["input"] = X
  for task, task in enumerate(endpoints):
  for ind, task in enumerate(endpoints):
    task_type = local_task_types[task]
    taskname = "task%d" % task
    sample_weights[taskname] = W[:, task]
    taskname = "task%d" % ind 
    sample_weights[taskname] = W[:,ind]
    if task_type == "classification":
      loss_dict[taskname] = "binary_crossentropy"
      data_dict[taskname] = to_one_hot(y[:,task])
      data_dict[taskname] = to_one_hot(y[:,ind])
    elif task_type == "regression":
      loss_dict[taskname] = "mean_squared_error"
      data_dict[taskname] = y[:,task]
      data_dict[taskname] = y[:,ind]
  sgd = SGD(lr=learning_rate, decay=decay, momentum=momentum, nesterov=nesterov)
  print "About to compile model!"
  model.compile(optimizer=sgd, loss=loss_dict)
+12 −2
Original line number Diff line number Diff line
@@ -31,11 +31,21 @@ def fit_singletask_models(train_data, modeltype):
  print "fit_singletask_models()"
  print "train_data.keys()"
  print train_data.keys()
  import numpy as np
  X_train = train_data["features"]
  print "np.shape(X_train)"
  print np.shape(X_train)
  sorted_tasks = train_data["sorted_tasks"]
  for task in sorted_tasks:
    print "Building model for task %s" % task
    (y_train, _) = train_data[task]
    (y_train, W_train) = train_data[task]
    W_train = W_train.ravel()
    task_X_train = X_train[W_train.nonzero()]
    task_y_train = y_train[W_train.nonzero()]
    print "np.shape(task_X_train)"
    print np.shape(task_X_train)
    print "np.shape(task_y_train)"
    print np.shape(task_y_train)
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
@@ -56,7 +66,7 @@ def fit_singletask_models(train_data, modeltype):
      model = ElasticNetCV(max_iter=2000, n_jobs=-1)
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(X_train, y_train.ravel())
    model.fit(task_X_train, task_y_train.ravel())
    models[task] = model
  return models

+11 −5
Original line number Diff line number Diff line
@@ -36,10 +36,13 @@ def compute_model_performance(raw_test_data, test_data, task_types, models,
    print("Target %s" % target, file=print_file)
    (y_test, w_test) = test_data[target]
    (ytest_raw, _) = raw_test_data[target]
    #model = models[target]
    model = models.itervalues().next()
    w_test = w_test.ravel()
    task_X_test = X_test[w_test.nonzero()]
    task_y_test = y_test[w_test.nonzero()]
    task_y_test_raw = y_test[w_test.nonzero()]
    model = models[target]
    results = eval_model(
        test_ids, X_test, y_test, ytest_raw, w_test, model,
        test_ids, task_X_test, task_y_test, task_y_test_raw, model,
        {target: task_types[target]}, modeltype=modeltype,
        output_transforms=output_transforms)
    all_results[target] = results[target]
@@ -54,7 +57,7 @@ def compute_model_performance(raw_test_data, test_data, task_types, models,
    if recall:
      recall_vals.update(compute_recall_score(results, task_types))
    if accuracy:
      recall_vals.update(compute_accuracy_score(results, task_types))
      accuracy_vals.update(compute_accuracy_score(results, task_types))

  if aucs:
    print("Mean AUC: %f" % np.mean(np.array(auc_vals.values())), file=print_file)
@@ -111,6 +114,9 @@ def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      print("model_predictions()")
      print("np.shape(X)")
      print(np.shape(X))
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
@@ -124,7 +130,7 @@ def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
    ypreds = [ypreds]
  return ypreds

def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types,
def eval_model(ids, X, Ytrue, Ytrue_raw, model, task_types,
               output_transforms, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.