Commit 836dcfbe authored by Bharath Ramsundar
Browse files

Basic structure of FeaturizedDataset.

parent 2f703bf4
Loading
Loading
Loading
Loading
+0 −3
Original line number Diff line number Diff line
@@ -46,9 +46,6 @@ class Model(object):
  @staticmethod
  def model_builder(model_type, task_types, model_params,
                    initialize_raw_model=True):
    print("model_builder()")
    print("model_params")
    print(model_params)
    if model_type in Model.registered_model_types:
      model = Model.registered_model_types[model_type](
          task_types, model_params, initialize_raw_model)
+18 −16
Original line number Diff line number Diff line
@@ -73,22 +73,23 @@ def add_train_test_command(subparsers):
      choices=["normalize-and-truncate"],
      help="Transforms to apply to input data.")
  train_test_cmd.add_argument(
      "--output-transforms", type=str, default="",
      help="Comma-separated list (no spaces) of transforms to apply to output data.\n"
           "Supported transforms are 'log' and 'normalize'. 'None' will be taken\n"
      "--output-transforms", nargs="+", default=[],
      help="Supported transforms are 'log' and 'normalize'. 'None' will be taken\n"
           "to mean no transforms are required.")
  train_test_cmd.add_argument(
      "--feature-types", type=str, required=1,
      help="Comma-separated list (no spaces) of types of featurizations to use.\n"
           "Each featurization must correspond to subdirectory in generated\n"
           "data directory.")
      "--feature-types", nargs="+", required=1,
      choices=["features", "fingerprints", "descriptors"],
      help="Featurizations of data to use.\n"
           "'features' denotes user-defined features.\n"
           "'fingerprints' denotes ECFP fingeprints.\n"
           "'descriptors' denotes RDKit chem descriptors.\n")
  train_test_cmd.add_argument(
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")
  train_test_cmd.add_argument(
      "--splittype", type=str, default="scaffold",
      choices=["scaffold", "random", "specified"],
      help="Type of train/test data-splitting. 'scaffold' uses Bemis-Murcko scaffolds.\n"
      help="Type of train/test data-split. 'scaffold' uses Bemis-Murcko scaffolds.\n"
           "specified requires that split be in original data.")
  train_test_cmd.add_argument(
      "--weight-positives", type=bool, default=False,
@@ -215,22 +216,23 @@ def add_model_command(subparsers):
      choices=["normalize-and-truncate"],
      help="Transforms to apply to input data.")
  train_test_group.add_argument(
      "--output-transforms", type=str, default="",
      help="Comma-separated list (no spaces) of transforms to apply to output data.\n"
           "Supported transforms are log and normalize.")
      "--output-transforms", nargs="+", default=[],
      help="Supported transforms are log and normalize.")
  train_test_group.add_argument(
      "--mode", default="singletask",
      choices=["singletask", "multitask"],
      help="Type of model being built.")
  train_test_group.add_argument(
      "--feature-types", type=str, required=1,
      help="Comma-separated list (no spaces) of types of featurizations to use.\n"
           "Each featurization must correspond to subdirectory in generated\n"
           "data directory.")
      "--feature-types", nargs="+", required=1,
      choices=["features", "fingerprints", "descriptors"],
      help="Featurizations of data to use.\n"
           "'features' denotes user-defined features.\n"
           "'fingerprints' denotes ECFP fingeprints.\n"
           "'descriptors' denotes RDKit chem descriptors.\n")
  train_test_group.add_argument(
      "--splittype", type=str, default="scaffold",
      choices=["scaffold", "random", "specified"],
      help="Type of train/test data-splitting. 'scaffold' uses Bemis-Murcko scaffolds.\n"
      help="Type of train/test data-split. 'scaffold' uses Bemis-Murcko scaffolds.\n"
           "specified requires that split be in original data.")

  add_model_group(model_cmd)
+5 −0
Original line number Diff line number Diff line
@@ -51,6 +51,11 @@ def compute_y_pred(model, data_dir, csv_out, split):

  split_df = metadata_df.loc[metadata_df['split'] == split]
  nb_batch = split_df.shape[0]
  print("compute_y_pred()")
  print("split_df.shape")
  print(split_df.shape)
  # TODO(rbharath/enf): This is only for GPU models, and currently depends
  # on magic numbers.
  MAX_GPU_RAM = float(691007488/50)

  for i, row in split_df.iterrows():
+10 −7
Original line number Diff line number Diff line
@@ -101,10 +101,9 @@ def process_field(data, field_type):
  elif field_type == "ndarray":
    return data

def generate_scaffold(smiles_elt, include_chirality=False, smiles_field="smiles"):
def generate_scaffold(smiles, include_chirality=False, smiles_field="smiles"):
  """Compute the Bemis-Murcko scaffold for a SMILES string."""
  smiles_string = smiles_elt[smiles_field]
  mol = Chem.MolFromSmiles(smiles_string)
  mol = Chem.MolFromSmiles(smiles)
  engine = ScaffoldGenerator(include_chirality=include_chirality)
  scaffold = engine.get_scaffold(mol)
  return scaffold
@@ -222,10 +221,14 @@ def featurize_inputs(feature_dir, input_files, input_type, fields, field_types,
                     feature_fields, task_fields, smiles_field,
                     split_field, id_field, threshold):

  featurize_input_partial = partial(featurize_input, feature_dir=feature_dir, input_type=input_type, 
                                    fields=fields, field_types=field_types, feature_fields=feature_fields,
                                    task_fields=task_fields, smiles_field=smiles_field,
                                    split_field=split_field, id_field=id_field, threshold=threshold)
  featurize_input_partial = partial(featurize_input, feature_dir=feature_dir,
                                    input_type=input_type, fields=fields,
                                    field_types=field_types,
                                    feature_fields=feature_fields,
                                    task_fields=task_fields,
                                    smiles_field=smiles_field,
                                    split_field=split_field, id_field=id_field,
                                    threshold=threshold)

  #for input_file in input_files:
  #  featurize_input_partial(input_file)
+90 −392

File changed.

Preview size limit exceeded, changes collapsed.