Commit f7801c00 authored by Bharath Ramsundar
Browse files

Adding features to allow prediction workflow

parent 5273d873
Loading
Loading
Loading
Loading
+81 −42
Original line number Diff line number Diff line
@@ -43,10 +43,6 @@ def add_featurize_group(featurize_cmd):
  featurize_group.add_argument(
      "--threshold", type=float, default=None,
      help="If specified, will be used to binarize real-valued target-fields.")
  featurize_group.add_argument(
      "--feature-dir", type=str, required=0,
      help="Directory where featurized dataset will be stored.\n"
           "Will be created if does not exist")
  featurize_group.add_argument(
      "--parallel", type=float, default=None,
      help="Use multiprocessing will be used to parallelize featurization.")
@@ -128,6 +124,7 @@ def add_model_group(fit_cmd):
      "--nesterov", action="store_true",
      help="If set, use Nesterov acceleration.")


def add_model_command(subparsers):
  """Adds flags for model subcommand."""
  model_cmd = subparsers.add_parser(
@@ -146,8 +143,26 @@ def add_model_command(subparsers):
      "--eval", action="store_true",
      help="Perform model eval step.")
  model_cmd.add_argument(
      "--base-dir", type=str, required=1,
      "--base-dir", type=str, default=None,
      help="The base directory for the model.")
  model_cmd.add_argument(
      "--feature-dir", type=str, default=None,
      help="The feature storage directory for the model.")
  model_cmd.add_argument(
      "--data-dir", type=str, default=None,
      help="The data storage directory for the model.")
  model_cmd.add_argument(
      "--model-dir", type=str, default=None,
      help="The model storage directory for the model.")
  model_cmd.add_argument(
      "--eval-train", type=bool, default=True,
      help="Evaluate model on train dataset.")
  model_cmd.add_argument(
      "--eval-test", type=bool, default=True,
      help="Evaluate model on test dataset.")
  model_cmd.add_argument(
      "--eval-full", type=bool, default=False,
      help="Evaluate model on full dataset.")
  add_featurize_group(model_cmd)

  add_transforms_group(model_cmd)
@@ -172,22 +187,31 @@ def ensure_exists(dirs):

def create_model(args):
  """Creates a model"""
  base_dir = args.base_dir
  feature_dir = os.path.join(base_dir, "features")
  data_dir = os.path.join(base_dir, "data")
  model_dir = os.path.join(base_dir, "model")
  ensure_exists([base_dir, feature_dir, data_dir, model_dir])

  model_name = args.model
  if args.base_dir is not None:
    feature_dir = os.path.join(args.base_dir, "features")
    data_dir = os.path.join(args.base_dir, "data")
    model_dir = os.path.join(args.base_dir, "model")
    ensure_exists([args.base_dir, feature_dir, data_dir, model_dir])
  else:
    if (args.model_dir is None or
        args.data_dir is None or
        args.feature_dir is None):
      raise ValueError("If base-dir not specified, must specify "
                       "feature-dir, data-dir, model-dir.")

    feature_dir, model_dir, data_dir = (args.feature_dir, args.model_dir,
                                        args.data_dir)  
    ensure_exists([feature_dir, data_dir, model_dir])
                

  print("+++++++++++++++++++++++++++++++++")
  print("Perform featurization")
  if args.featurize:
    featurize_inputs(
        feature_dir, args.input_files,
        args.user_specified_features, args.tasks,
        args.smiles_field, args.split_field, args.id_field, args.threshold,
        args.parallel)
        feature_dir, data_dir, args.input_files, args.user_specified_features,
        args.tasks, args.smiles_field, args.split_field, args.id_field,
        args.threshold, args.parallel)

  print("+++++++++++++++++++++++++++++++++")
  print("Perform train-test split")
@@ -205,24 +229,35 @@ def create_model(args):
        model_name, model_params, model_dir, data_dir)

  print("+++++++++++++++++++++++++++++++++")
  if args.eval:
    if args.eval_train:
      print("Eval Model on Train")
      print("-------------------")
  if args.eval:
      train_dir = os.path.join(data_dir, "train-data")
      csv_out_train = os.path.join(data_dir, "train.csv")
      stats_out_train = os.path.join(data_dir, "train-stats.txt")
    csv_out_test = os.path.join(data_dir, "test.csv")
    stats_out_test = os.path.join(data_dir, "test-stats.txt")
    train_dir = os.path.join(data_dir, "train-data")
      eval_trained_model(
          model_name, model_dir, train_dir, csv_out_train,
        stats_out_train, split="train")
          stats_out_train)

    if args.eval_test:
      print("Eval Model on Test")
      print("------------------")
  if args.eval:
      test_dir = os.path.join(data_dir, "test-data")
      csv_out_test = os.path.join(data_dir, "test.csv")
      stats_out_test = os.path.join(data_dir, "test-stats.txt")
      eval_trained_model(
          model_name, model_dir, test_dir, csv_out_test,
        stats_out_test, split="test")
          stats_out_test)

    if args.eval_full:
      print("Eval Model on Full Dataset")
      print("--------------------------")
      csv_out_full = os.path.join(data_dir, "full.csv")
      stats_out_full = os.path.join(data_dir, "full-stats.txt")
      eval_trained_model(
          model_name, model_dir, data_dir, csv_out_full,
          stats_out_full)

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -231,7 +266,7 @@ def parse_args(input_args=None):
  add_model_command(subparsers)
  return parser.parse_args(input_args)

def featurize_inputs(feature_dir, input_files,
def featurize_inputs(feature_dir, data_dir, input_files,
                     user_specified_features, tasks, smiles_field,
                     split_field, id_field, threshold, parallel):

@@ -252,6 +287,11 @@ def featurize_inputs(feature_dir, input_files,
    for input_file in input_files:
      featurize_input_partial(input_file)

  dataset_files = glob.glob(os.path.join(feature_dir, "*.joblib"))
  print("Loading featurized data.")
  samples_dir = os.path.join(data_dir, "samples")
  samples = FeaturizedSamples(samples_dir, dataset_files)

def featurize_input(input_file, feature_dir, user_specified_features, tasks,
                    smiles_field, split_field, id_field, threshold):
  """Featurizes raw input data."""
@@ -270,12 +310,8 @@ def train_test_split(paths, input_transforms, output_transforms,
                     feature_types, splittype, mode, data_dir):
  """Saves transformed model."""

  dataset_files = []
  for path in paths:
    dataset_files += glob.glob(os.path.join(path, "*.joblib"))
  print("Loading featurized data.")
  samples_dir = os.path.join(data_dir, "samples")
  samples = FeaturizedSamples(samples_dir, dataset_files, reload=False)
  samples = FeaturizedSamples(samples_dir, reload=True)
  
  print("Split data into train/test")
  train_samples_dir = os.path.join(data_dir, "train-samples")
@@ -286,9 +322,10 @@ def train_test_split(paths, input_transforms, output_transforms,
  train_data_dir = os.path.join(data_dir, "train-data")
  test_data_dir = os.path.join(data_dir, "test-data")

  print("Generating train data.")
  print("Generating train dataset.")
  train_dataset = Dataset(train_data_dir, train_samples, feature_types)
  print("Generating test data.")

  print("Generating test dataset.")
  test_dataset = Dataset(test_data_dir, test_samples, feature_types)

  print("Transforming train data.")
@@ -311,13 +348,15 @@ def fit_model(model_name, model_params, model_dir, data_dir):
  model.save(model_dir)

def eval_trained_model(model_type, model_dir, data_dir,
                       csv_out, stats_out, split="test"):
                       csv_out, stats_out):
  """Evaluates a trained model on specified data."""
  model = Model.load(model_type, model_dir)
  data = Dataset(data_dir)

  evaluator = Evaluator(model, data, verbose=True)
  evaluator.compute_model_performance(csv_out, stats_out)
  pred_y_df, perf_df = evaluator.compute_model_performance(csv_out, stats_out)
  print("Model Performance.")
  print(perf_df)

def main():
  """Invokes argument parser."""
+3 −0
Original line number Diff line number Diff line
@@ -279,6 +279,9 @@ def write_dataset_single(val, data_dir, feature_types):

def _df_to_numpy(df, feature_types):
  """Transforms a featurized dataset df into standard set of numpy arrays"""
  if not set(feature_types).issubset(df.keys()):
    raise ValueError(
        "Featurized data does not support requested feature_types.")
  # perform common train/test split across all tasks
  n_samples = df.shape[0]
  sorted_tasks = FeaturizedSamples.get_sorted_task_names(df)
+4 −4
Original line number Diff line number Diff line
@@ -118,8 +118,8 @@ class Evaluator(object):
        rms = np.sqrt(mean_squared_error(y, y_pred))
        performance_df.loc[i] = [task_name, r2s, rms]

    if self.verbose:
      print("Saving model performance scores to %s" % stats_file)
    performance_df.to_csv(stats_file)

    print("Model performance scores:")
    print(performance_df)
    return pred_y_df, performance_df
+1 −4
Original line number Diff line number Diff line
@@ -179,7 +179,6 @@ class DataFeaturizer(object):
      df[task] = ori_df[[task]]
    if self.split_field is not None:
      df["split"] = ori_df[[self.split_field]]

    return df

  def _featurize_df(self, df, rows, feature_type):
@@ -197,7 +196,7 @@ class DataFeaturizer(object):
          features_data.append({feature_type: np.array(feature_list)})
        df[feature_type] = pd.DataFrame(features_data)
        return
    elif feature_type in ["ECFP", "RDKIT-descriptors", "NNScore"]:
    elif feature_type in ["ECFP", "RDKIT-descriptors"]:
      if feature_type == "ECFP":
        if self.verbose:
          print("Generating ECFP circular fingerprints.")
@@ -206,8 +205,6 @@ class DataFeaturizer(object):
        if self.verbose:
          print("Generating RDKIT descriptors.")
        featurizer = SimpleDescriptors()
      elif feature_type == "NNScore":
        pass
      features = []
      sample_smiles = df["smiles"].tolist()
      for ind, smiles in enumerate(sample_smiles):
+11 −0
Original line number Diff line number Diff line
mol_id,smiles,task0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11,task12,task13,task14,task15,task16
CID2999678,Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
CID2999679,Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1,0,0,,0,,0,,,,,0,0,,,0,0,0
CID5390003,COc1ccccc1NC(=O)C1=C(C)N=C2N=CNN2C1c1ccc(C)o1,0,,,,,,,0,,0,,0,,,,,
CID5390002,O=C1c2ccccc2/C(=C\NC2CCS(=O)(=O)C2)C(=O)N1c1ccccc1,,0,,,,0,,0,,,,0,,,,,
CID2999670,NC(=O)NC(Cc1ccccc1)C(=O)O,0,,,,,,,,,0,,,,,0,0,
CID5390000,Cc1[nH]nc(NC=C2C(=O)N(C)C(=O)N(C)C2=O)c1-c1ccccc1,,,,,,,,0,,,,,,,,,
CID1511280,CC(=O)N1CCC2(CC1)NC(=O)N(c1ccccc1)N2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
CID5390006,COc1ccc(C2C(C(=O)NCc3ccccc3)=C(C)N=C3N=CNN32)cc1OC,,,,,,,,,,0,0,,,,,,
CID5390005,CCc1cc2c(CN3CCN(C)CC3)cc(=O)oc2cc1O,,,,,,,,,,0,,,,,,,
CID5390004,CCc1cc2c(CN3CCN(CCO)CC3)cc(=O)oc2cc1O,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Loading