Commit a2e91325 authored by Bharath Ramsundar
Browse files

Cleaned up modeler some.

parent 6b80747e
Loading
Loading
Loading
Loading
+6 −14
Original line number Diff line number Diff line
@@ -30,14 +30,7 @@ def add_featurize_group(featurize_cmd):
           "containing a pandas dataframe. If sdf, should be in\n"
           "(perhaps gzipped) sdf file.")
  featurize_group.add_argument(
      "--fields", required=1, nargs="+",
      help="Names of fields.")
  featurize_group.add_argument(
      "--field-types", required=1, nargs="+",
      choices=["string", "float", "list-string", "list-float", "ndarray"],
      help="Type of data in fields.")
  featurize_group.add_argument(
      "--feature-fields", type=str, nargs="+",
      "--user-specified-features", type=str, nargs="+",
      help="Optional field that holds pre-computed feature vector")
  featurize_group.add_argument(
      "--task-fields", type=str, nargs="+", required=1,
@@ -52,7 +45,6 @@ def add_featurize_group(featurize_cmd):
      "--id-field", type=str, default=None,
      help="Name of field specifying unique identifier for molecule.\n"
           "If none is specified, then smiles-field is used as identifier.")
  # TODO(rbharath): This should be moved to train-tests-split
  featurize_group.add_argument(
      "--threshold", type=float, default=None,
      help="If specified, will be used to binarize real-valued target-fields.")
@@ -80,7 +72,7 @@ def add_transforms_group(cmd):
      help="Type of model being built.")
  transform_group.add_argument(
      "--feature-types", nargs="+", required=1,
      choices=["features", "fingerprints", "descriptors"],
      choices=["user-specified-features", "ECFP", "RDKIT-descriptors"],
      help="Featurizations of data to use.\n"
           "'features' denotes user-defined features.\n"
           "'fingerprints' denotes ECFP fingeprints.\n"
@@ -250,8 +242,8 @@ def create_model(args):
  print("Perform featurization")
  if not args.skip_featurization:
    featurize_inputs(
        feature_dir, args.input_files, args.input_type, args.fields,
        args.field_types, args.feature_fields, args.task_fields,
        feature_dir, args.input_files, args.input_type,
        args.user_specified_features, args.task_fields,
        args.smiles_field, args.split_field, args.id_field, args.threshold)

  print("+++++++++++++++++++++++++++++++++")
@@ -310,7 +302,7 @@ def featurize_inputs_wrapper(args):
    os.makedirs(args.feature_dir)
  featurize_inputs(
      args.feature_dir, args.input_files, args.input_type, args.fields,
      args.field_types, args.feature_fields, args.task_fields,
      args.field_types, args.user_specified_features, args.task_fields,
      args.smiles_field, args.split_field, args.id_field, args.threshold)

def train_test_split_wrapper(args):
+27 −22
Original line number Diff line number Diff line
@@ -63,24 +63,28 @@ class Samples(object):
    else:
      self.df = df

  # TODO(rbharath): There are some tricky issues dealing with information
  # separation between Samples and FeaturizedSamples that affect the semantics
  # of load and save. Punting for the moment to get something that runs.
  def save(self, out):
    """
    Saves samples to disk.
    """
    sample_params = {"input_file": self.input_file,
                     "tasks": self.tasks,
                     "smiles_field": self.smiles_field,
                     "split_field": self.split_field,
                     "id_field": self.id_field,
                     "threshold": self.threshold,
                     "user_specified_features": self.user_specified_features,
                     "df": self.df}
    save_to_disk(sample_params, out)

  @staticmethod
  def load(out):
    sample_params = load_from_disk(out)
    return Samples(**sample_params)
    #sample_params = {"input_file": self.input_file,
    #                 "tasks": self.tasks,
    #                 "smiles_field": self.smiles_field,
    #                 "split_field": self.split_field,
    #                 "id_field": self.id_field,
    #                 "threshold": self.threshold,
    #                 "user_specified_features": self.user_specified_features,
    #                 "df": self.df}
    #save_to_disk(sample_params, out)
    save_to_disk(self.df, out)

  #@staticmethod
  #def load(out):
  #  sample_params = load_from_disk(out)
  #  return Samples(**sample_params)

  def get_samples(self):
    """Accessor for samples in this object."""
@@ -189,14 +193,14 @@ class Samples(object):
        for feature in user_specified_features:
          feature_list.append(row[feature])
        features_data.append({"row": np.array(feature_list)})
      df["features"] = pd.DataFrame(features_data)
      df["user-specified-features"] = pd.DataFrame(features_data)
    return df

  def featurize(self, featuretype, log_every_n=1000):
    """Generates circular fingerprints for dataset."""
    if featuretype == "ECFP":
      featurizer = CircularFingerprint(size=1024)
    elif featuretype == "descriptors":
    elif featuretype == "RDKIT-descriptors":
      featurizer = SimpleDescriptors()
    else:
      raise ValueError("Unsupported featuretype requested.")
@@ -220,23 +224,24 @@ def featurize_input(feature_dir, input_file, user_specified_features, tasks,
  print("Generating circular fingerprints")
  samples.featurize("ECFP")
  print("Generating rdkit descriptors")
  add_vs_utils_features(df, "descriptors")
  samples.featurize("RDKIT-descriptors")
  df_filename = os.path.join(
      feature_dir, "%s.joblib" %(os.path.splitext(os.path.basename(input_file))[0]))
  print("Saving samples to disk.")
  samples.save(df_filename)

def featurize_inputs(feature_dir, input_files, input_type, fields, field_types,
def featurize_inputs(feature_dir, input_files,
                     feature_fields, task_fields, smiles_field,
                     split_field, id_field, threshold):

  featurize_input_partial = partial(featurize_input, feature_dir=feature_dir,
                                    input_type=input_type, fields=fields,
                                    field_types=field_types,
  featurize_input_partial = partial(featurize_input,
                                    feature_dir=feature_dir,
                                    input_type=input_type,
                                    feature_fields=feature_fields,
                                    task_fields=task_fields,
                                    smiles_field=smiles_field,
                                    split_field=split_field, id_field=id_field,
                                    split_field=split_field,
                                    id_field=id_field,
                                    threshold=threshold)

  pool = mp.Pool(int(mp.cpu_count()/2))