Commit a247f348 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Multitask fixes.

parent de7eb3fe
Loading
Loading
Loading
Loading
+0 −4
Original line number Diff line number Diff line
@@ -42,10 +42,6 @@ def fit_singletask_models(train_data, modeltype):
    W_train = W_train.ravel()
    task_X_train = X_train[W_train.nonzero()]
    task_y_train = y_train[W_train.nonzero()]
    print "np.shape(task_X_train)"
    print np.shape(task_X_train)
    print "np.shape(task_y_train)"
    print np.shape(task_y_train)
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
+5 −7
Original line number Diff line number Diff line
@@ -35,12 +35,12 @@ def add_featurize_group(featurize_cmd):
      help="Input file with data.")
  featurize_group.add_argument(
      "--input-type", default="csv",
      choices=["xlsx", "csv", "pandas", "sdf"],
      choices=["csv", "pandas", "sdf"],
      help="Type of input file. If pandas, input must be a pkl.gz\n"
           "containing a pandas dataframe. If sdf, should be in\n"
           "(perhaps gzipped) sdf file.")
  featurize_group.add_argument(
      "--delimiter", default=",",
      "--delimiter", default=",", type=str,
      help="If csv input, delimiter to use for read csv file")
  featurize_group.add_argument(
      "--fields", required=1, nargs="+",
@@ -293,13 +293,12 @@ def create_model(args):
  print("+++++++++++++++++++++++++++++++++")
  print("Perform train-test split")
  paths = [data_dir]
  weight_positives = False  # Hard coding this for now
  train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
  test_out = os.path.join(data_dir, "%s-test.joblib" % args.name)
  if not args.skip_train_test_split:
    _train_test_input(
        paths, args.output_transforms, args.input_transforms, args.feature_types,
        args.splittype, weight_positives, args.mode, train_out, test_out,
        args.splittype, args.mode, train_out, test_out,
        args.target_fields)

  print("+++++++++++++++++++++++++++++++++")
@@ -388,18 +387,17 @@ def train_test_input(args):
  """Wrapper function that calls _train_test_input after unwrapping args."""
  _train_test_input(
      args.paths, args.output_transforms, args.input_transforms,
      args.feature_types, args.splittype, args.weight_positives, args.mode,
      args.feature_types, args.splittype, args.mode,
      args.train_out, args.test_out, args.target_fields)

def _train_test_input(paths, output_transforms, input_transforms,
                      feature_types, splittype, weight_positives, mode,
                      feature_types, splittype, mode,
                      train_out, test_out, target_names):
  """Saves transformed model."""
  if output_transforms == "" or output_transforms == "None":
    output_transforms = []
  else:
    output_transforms = output_transforms.split(",")
  output_transforms_dict = {target: output_transforms for target in target_names}
  feature_types = feature_types.split(",")
  print("About to process_dataset")
  train_dict, test_dict = process_datasets(
+44 −10
Original line number Diff line number Diff line
@@ -74,17 +74,29 @@ def compute_model_performance(raw_test_data, test_data, task_types, models,
      rms_vals.update(compute_rms_scores(results, task_types))
  # Print classification metrics
  if aucs:
    print("AUCs", file=print_file)
    print(auc_vals, file=print_file)
    print("Mean AUC: %f" % np.mean(np.array(auc_vals.values())), file=print_file)
  if mcc:
    print("MCCs", file=print_file)
    print(mcc_vals, file=print_file)
    print("Mean MCC: %f" % np.mean(np.array(mcc_vals.values())), file=print_file)
  if recall:
    print("Recalls", file=print_file)
    print(recall_vals, file=print_file)
    print("Mean Recall: %f" % np.mean(np.array(recall_vals.values())), file=print_file)
  if accuracy:
    print("Accuracies", file=print_file)
    print(accuracy_vals, file=print_file)
    print("Mean Accuracy: %f" % np.mean(np.array(accuracy_vals.values())), file=print_file)
  # Print regression metrics
  if r2s:
    print("R^2s", file=print_file)
    print(r2_vals, file=print_file)
    print("Mean R^2: %f" % np.mean(np.array(r2_vals.values())), file=print_file)
  if rms:
    print("RMSs", file=print_file)
    print(rms_vals, file=print_file)
    print("Mean RMS: %f" % np.mean(np.array(rms_vals.values())), file=print_file)

  return all_results, aucs, r2s, rms
@@ -179,19 +191,41 @@ def eval_model(ids, X, Ytrue, Ytrue_raw, model, task_types,

def results_to_csv(results, out, task_type="classification"):
  """Writes per-task results as a tab-delimited CSV to out.

  Pivots the per-task results into one row per molecule, with a
  "<task>-True" and "<task>-Pred" column pair for every task, so multitask
  results line up on a single line per molecule.

  Args:
    results: dict mapping task name -> (mol_ids, ytrues, yscores) triple.
    out: path of the file to write.
    task_type: "classification" or "regression"; controls how raw model
      scores are converted into the reported prediction.
  """
  sorted_tasks = sorted(results.keys())
  processed_results = {}
  for task in sorted_tasks:
    mol_ids, ytrues, yscores = results[task]
    if task_type == "classification":
      # yscores holds per-class probabilities; report the rounded
      # positive-class probability as the hard class prediction.
      yscores = np.around(yscores[:, 1]).astype(int)
    elif task_type == "regression":
      # Some models return (n, 1)-shaped predictions; flatten to scalars.
      if isinstance(yscores[0], np.ndarray):
        yscores = yscores[:, 0]
    for (mol_id, ytrue, yscore) in zip(mol_ids, ytrues, yscores):
      if mol_id not in processed_results:
        processed_results[mol_id] = {}
      processed_results[mol_id][task] = (ytrue, yscore)
  # NOTE(review): opened in text mode with newline="" as the Python 3 csv
  # module requires; the previous "wb" mode raises TypeError on Python 3
  # because csv.writer emits str, not bytes.
  with open(out, "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile, delimiter="\t")
    colnames = ["Ids"]
    for task in sorted_tasks:
      colnames += ["%s-True" % task, "%s-Pred" % task]
    csvwriter.writerow(colnames)
    # Sort molecule ids so the output is deterministic across runs.
    for mol_id in sorted(processed_results):
      col = [mol_id]
      for task in sorted_tasks:
        if task in processed_results[mol_id]:
          ytrue, yscore = processed_results[mol_id][task]
          # TODO(rbharath): Missing data handling is broken for multitask
          # regression models! Find a more general solution (perhaps by
          # using NaNs in place of -1)
          # Handle missing data case
          if ytrue == -1:
            (ytrue, yscore) = ("", "")
        else:
          # Molecule has no measurement for this task at all.
          (ytrue, yscore) = ("", "")
        col += [ytrue, yscore]
      csvwriter.writerow(col)
  print("Writing results on test set to %s" % out)

def compute_r2_scores(results, task_types):
  """Transforms the results dict into R^2 values and prints them.