Commit 2860de81 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Cleaned up modeler, but eval still broken.

parent f440639b
Loading
Loading
Loading
Loading
+4 −99
Original line number Diff line number Diff line
@@ -22,12 +22,6 @@ import deepchem.models.deep3d
# the --skip-foo flags, it's possible to run all functionality directly through
# create_model. Perhaps trim the fat and delete the remaining commands.

def add_featurization_command(subparsers):
  """Registers the featurize subcommand and its flag group."""
  cmd = subparsers.add_parser(
      "featurize", help="Featurize raw input data.")
  add_featurize_group(cmd)

def add_featurize_group(featurize_cmd):
  """Adds flags for featurizization."""
  featurize_group = featurize_cmd.add_argument_group("Input Specifications")
@@ -60,7 +54,6 @@ def add_featurize_group(featurize_cmd):
  featurize_group.add_argument(
      "--parallel", type=float, default=None,
      help="Use multiprocessing will be used to parallelize featurization.")
  featurize_group.set_defaults(func=featurize_inputs_wrapper)

def add_transforms_group(cmd):
  """Adds flags for data transforms."""
@@ -94,21 +87,6 @@ def add_transforms_group(cmd):
      "--weight-positives", type=bool, default=False,
      help="Weight positive examples to have same total weight as negatives.")

def add_train_test_command(subparsers):
  """Adds flags for train-test-split subcommand.

  Registers "train-test-split", which applies the standard data transforms
  to featurized data, splits it into train/test sets, and stores the
  resulting (X, y) matrices under --data-dir.
  """
  train_test_cmd = subparsers.add_parser(
      "train-test-split",
      help="Apply standard data transforms to raw features generated by featurize,\n"
           "then split data into train/test and store data as (X,y) matrices.")
  add_transforms_group(train_test_cmd)
  train_test_cmd.add_argument(
      # required expects a bool; the old required=1 only worked by truthiness.
      "--paths", nargs="+", required=True,
      help="Paths to input datasets.")
  train_test_cmd.add_argument(
      "--data-dir", type=str, required=True,
      help="Location to save train and test data.")
  train_test_cmd.set_defaults(func=train_test_split_wrapper)

def add_model_group(fit_cmd):
  """Adds flags for specifying models."""
  group = fit_cmd.add_argument_group("model")
@@ -154,49 +132,6 @@ def add_model_group(fit_cmd):
      "--nesterov", action="store_true",
      help="If set, use Nesterov acceleration.")

def add_fit_command(subparsers):
  """Adds arguments for fit subcommand.

  Registers "fit", which loads transformed data from --data-dir, fits the
  model described by the model flag group, and saves it to --model-dir.
  """
  fit_cmd = subparsers.add_parser(
      "fit", help="Fit a model to training data.")
  group = fit_cmd.add_argument_group("load-and-transform")
  group.add_argument(
      # required expects a bool; the old required=1 only worked by truthiness.
      "--data-dir", required=True,
      help="Location of saved transformed data.")
  add_model_group(fit_cmd)
  group = fit_cmd.add_argument_group("save")
  group.add_argument(
      "--model-dir", type=str, required=True,
      help="Location to save trained model.")
  fit_cmd.set_defaults(func=fit_model_wrapper)

def add_eval_command(subparsers):
  """Adds arguments for eval subcommand.

  Registers "eval", which loads a saved model and transformed test data,
  then writes predictions (--csv-out) and statistics (--stats-out).
  """
  eval_cmd = subparsers.add_parser(
      "eval",
      help="Evaluate trained model on test data processed by transform.")
  group = eval_cmd.add_argument_group("load model/data")
  group.add_argument(
      # required expects a bool; the old required=1 only worked by truthiness.
      "--saved-model", type=str, required=True,
      help="Location from which to load saved model.")
  group.add_argument(
      "--saved-data", required=True, help="Location of saved transformed data.")
  eval_cmd.add_argument(
      "--csv-out", type=str, required=True,
      help="Outputted predictions on evaluated set.")
  eval_cmd.add_argument(
      # Was required=1j -- a complex-number literal typo that only worked
      # because any nonzero complex is truthy.
      "--stats-out", type=str, required=True,
      help="Computed statistics on evaluated set.")
  eval_cmd.set_defaults(func=eval_trained_model_wrapper)

def add_predict_command(subparsers):
  """Adds arguments for predict subcommand."""
  predict_cmd = subparsers.add_parser(
      "predict",
      help="Make predictions of model on new data.")
  # TODO: argument groups for prediction inputs/outputs are not wired up yet.

# TODO(rbharath): There are a lot of duplicate commands introduced here. Is
# there a nice way to factor them?
def add_model_command(subparsers):
  """Adds flags for model subcommand."""
  model_cmd = subparsers.add_parser(
@@ -297,25 +232,9 @@ def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  subparsers = parser.add_subparsers(title='Modes')

  add_featurization_command(subparsers)
  add_train_test_command(subparsers)
  add_fit_command(subparsers)
  add_eval_command(subparsers)

  add_model_command(subparsers)

  return parser.parse_args(input_args)

def featurize_inputs_wrapper(args):
  """Wrapper function that calls featurize_inputs with args unwrapped.

  Creates args.feature_dir if needed, then forwards all featurization
  flags to featurize_inputs.
  """
  if not os.path.exists(args.feature_dir):
    os.makedirs(args.feature_dir)
  # featurize_inputs takes a trailing `parallel` parameter; the previous
  # version dropped args.parallel and raised TypeError on every call.
  featurize_inputs(
      args.feature_dir, args.input_files, args.user_specified_features,
      args.tasks, args.smiles_field, args.split_field, args.id_field,
      args.threshold, args.parallel)

def featurize_inputs(feature_dir, input_files,
                     user_specified_features, tasks, smiles_field,
                     split_field, id_field, threshold, parallel):
@@ -351,12 +270,6 @@ def featurize_input(input_file, feature_dir, user_specified_features, tasks,
      feature_dir, "%s.joblib" %(os.path.splitext(os.path.basename(input_file))[0]))
  featurizer.featurize(input_file, FeaturizedSamples.feature_types, out)

def train_test_split_wrapper(args):
  """Wrapper function that calls train_test_split after unwrapping args."""
  # Docstring previously referenced a nonexistent _train_test_split_wrapper.
  train_test_split(args.paths, args.input_transforms,
                   args.output_transforms, args.feature_types,
                   args.splittype, args.mode, args.data_dir)

def train_test_split(paths, input_transforms, output_transforms,
                     feature_types, splittype, mode, data_dir):
  """Saves transformed model."""
@@ -391,12 +304,6 @@ def train_test_split(paths, input_transforms, output_transforms,
  print("Transforming test data.")
  test_dataset.transform(input_transforms, output_transforms)

def fit_model_wrapper(args):
  """Unwraps parsed CLI args and delegates model fitting to fit_model."""
  params = extract_model_params(args)
  fit_model(args.model_name, params, args.model_dir, args.data_dir)

def fit_model(model_name, model_params, model_dir, data_dir):
  """Builds model from featurized data."""
  task_type = Model.get_task_type(model_name)
@@ -410,16 +317,14 @@ def fit_model(model_name, model_params, model_dir, data_dir):
  model.fit(train)
  model.save(model_dir)

def eval_trained_model_wrapper(args):
  """Wrapper function that calls _eval_trained_model with unwrapped args."""
  # NOTE(review): this is broken (see commit message "eval still broken").
  # add_eval_command declares --saved-model, --saved-data, --csv-out and
  # --stats-out, so args has saved_model/saved_data attributes -- but this
  # wrapper reads args.model, args.model_dir and args.data_dir, which are
  # never defined, raising AttributeError at runtime. Confirm the intended
  # mapping before fixing (presumably saved_model -> model_dir and
  # saved_data -> data_dir, with the model type coming from a new flag).
  eval_trained_model(
      args.model, args.model_dir, args.data_dir,
      args.csv_out, args.stats_out, split="test")

def eval_trained_model(model_type, model_dir, data_dir,
                       csv_out, stats_out, split="test"):
  """Evaluates a trained model on specified data."""
  model = Model.load(model_type, model_dir)
  print("eval_trained_model()")
  print("data_dir")
  print(data_dir)
  
  data = Dataset(data_dir)

  evaluator = Evaluator(model, data, verbose=True)
+13 −4
Original line number Diff line number Diff line
@@ -39,8 +39,11 @@ class Dataset(object):
          feature_types=feature_types)

      metadata_rows = []
      for df_file in samples.dataset_files:
        metadata_rows.append(write_dataset_single_partial(df_file))
      # TODO(rbharath): Still a bit of information leakage.
      for df_file, df in zip(samples.dataset_files, samples.itersamples()):
        retval = write_dataset_single_partial((df_file, df))
        if retval is not None:
          metadata_rows.append(retval)

      # TODO(rbharath): FeaturizedSamples should not be responsible for
      # X-transform, X_sums, etc. Move that stuff over to Dataset.
@@ -246,9 +249,12 @@ def compute_sums_and_nb_sample(tensor, W=None):
# The following are all associated with Dataset, but are separate functions to
# make it easy to use multiprocessing.

def write_dataset_single(df_file, data_dir, feature_types):
def write_dataset_single(val, data_dir, feature_types):
  (df_file, df) = val
  print("Examining %s" % df_file)
  df = load_from_disk(df_file)
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = df_to_numpy(df, feature_types)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
@@ -286,6 +292,9 @@ def df_to_numpy(df, feature_types):
      feature_list.append(datapoint[feature_type])
    features = np.squeeze(np.concatenate(feature_list))
    tensors.append(features)
  print("df_to_numpy()")
  print("len(df)")
  print(len(df))
  x = np.stack(tensors)

  # Remove entries with missing labels
+16 −0
Original line number Diff line number Diff line
@@ -299,6 +299,22 @@ class FeaturizedSamples(object):
    save_to_disk(df, self._get_compounds_filename())
    self.compounsd_df = df

  def itersamples(self):
    """Provides an iterator over samples.

    Loads each dataset shard file from disk and yields, per shard, the
    dataframe of rows whose mol_id appears in self.compounds_df.
    """
    compound_ids = set(self.compounds_df["mol_id"])
    for df_file in self.dataset_files:
      df = load_from_disk(df_file)
      # Boolean isin mask replaces the old iterrows + df.iloc loop: iterrows
      # yields index *labels* while iloc indexes by *position*, so the old
      # code selected wrong rows whenever the shard's index was not a default
      # RangeIndex. The mask is also a single vectorized pass instead of a
      # per-row Python loop. (Assumes df is a pandas DataFrame with a
      # "mol_id" column -- consistent with the iterrows usage it replaces.)
      yield df[df["mol_id"].isin(compound_ids)]

  def train_test_split(self, splittype, train_dir, test_dir, seed=None,
                       frac_train=.8):
    """