Commit c3c2d080 authored by evanfeinberg

started making featurization accept multiple feature files

parent f2dca20f
+29 −10
@@ -32,12 +32,12 @@ def add_featurize_group(featurize_cmd):
   """Adds flags for featurization."""
   featurize_group = featurize_cmd.add_argument_group("Input Specifications")
   featurize_group.add_argument(
-      "--input-file", required=1,
+      "--input-files", required=1, nargs="+",
       help="Input file with data.")
   featurize_group.add_argument(
       "--input-type", default="csv",
       choices=["xlsx", "csv", "pandas", "sdf"],
-      help="Type of input file. If pandas, input must be a pkl.gz\n"
+      help="Type of input file. If pandas, input must be a joblib.gz\n"
           "containing a pandas dataframe. If sdf, should be in\n"
           "(perhaps gzipped) sdf file.")
   featurize_group.add_argument(
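Note on the flag change above: with nargs="+", argparse collects one or more values into a list stored under the dest input_files (inner dashes become underscores), so downstream code should read args.input_files rather than args.input_file. A minimal sketch of the parsing behavior (the shard file names here are made up):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input-files", required=True, nargs="+",
                    help="Input files with data.")
args = parser.parse_args(["--input-files", "shard_0_10.csv", "shard_1_10.csv"])
print(args.input_files)  # ['shard_0_10.csv', 'shard_1_10.csv']

argparse only truth-tests required, so the existing required=1 behaves like required=True, though the boolean spelling is the idiomatic one.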
@@ -357,13 +357,15 @@ def parse_args(input_args=None):
 
 def featurize_input(args):
   """Wrapper function that calls _featurize_input with args unwrapped."""
-  _featurize_input(
+  _featurize_inputs(
       args.name, args.out, args.input_file, args.input_type, args.fields,
       args.field_types, args.feature_fields, args.prediction_field,
       args.smiles_field, args.split_field, args.id_field, args.threshold,
       args.delimiter)
 
-def _featurize_input(name, out, input_file, input_type, fields, field_types,
+#make this helper and add a wrapper function that has "input files" and add multiprocessing option
+#shard into 10x at this step (make a flag)
+def _featurize_input(input_file, name, out, input_type, fields, field_types,
                      feature_fields, prediction_field, smiles_field,
                      split_field, id_field, threshold, delimiter):
   """Featurizes raw input data."""
@@ -371,19 +373,31 @@ def _featurize_input(name, out, input_file, input_type, fields, field_types,
     raise ValueError("number of fields does not equal number of field types")
   if id_field is None:
     id_field = smiles_field
-  out_x_pkl, out_y_pkl = generate_directories(name, out, feature_fields)
+  out_x_joblib, out_y_joblib = generate_directories(name, input_file, out, feature_fields)
   df, mols = extract_data(
       input_file, input_type, fields, field_types, prediction_field,
       smiles_field, threshold, delimiter)
   print "Generating targets"
   generate_targets(df, prediction_field, split_field,
-                   smiles_field, id_field, out_y_pkl)
+                   smiles_field, id_field, out_y_joblib)
   print "Generating user-specified features"
-  generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
+  generate_features(df, feature_fields, smiles_field, id_field, out_x_joblib)
   print "Generating circular fingerprints"
-  generate_vs_utils_features(df, name, out, smiles_field, id_field, "fingerprints")
+  generate_vs_utils_features(df, name, input_file, out, smiles_field, id_field, "fingerprints")
   print "Generating rdkit descriptors"
-  generate_vs_utils_features(df, name, out, smiles_field, id_field, "descriptors")
+  generate_vs_utils_features(df, name, input_file, out, smiles_field, id_field, "descriptors")
 
+def _featurize_inputs(name, out, input_files, input_type, fields, field_types,
+                      feature_fields, prediction_field, smiles_field,
+                      split_field, id_field, threshold, delimiter):
+  other_arguments = (name, out, input_type, fields, field_types,
+                     feature_fields, prediction_field, smiles_field,
+                     split_field, id_field, threshold, delimiter)
+  pool = mp.Pool(mp.cpu_count())
+  pool.map(_featurize_input, itertools.izip(input_files, itertools.repeat(other_arguments)))
+  pool.terminate()
+
 
 def train_test_input(args):
   """Wrapper function that calls _train_test_input after unwrapping args."""
@@ -392,6 +406,7 @@ def train_test_input(args):
       args.feature_types, args.splittype, args.weight_positives, args.mode,
       args.train_out, args.test_out)
 
+#decompose this into: a) compute train test split using only smiles. b) for each shard, make a train test numpy array
 def _train_test_input(paths, output_transforms, input_transforms,
                       feature_types, splittype, weight_positives, mode,
                       train_out, test_out):
@@ -428,6 +443,10 @@ def _train_test_input(paths, output_transforms, input_transforms,
   save_sharded_dataset(stored_train, train_out)
   save_sharded_dataset(stored_test, test_out)
 
+def _train_test_inputs(paths, output_transforms, input_transforms,
+                       feature_types, splittype, weight_positives, mode,
+                       train_out, test_out):
+
 def fit_model(args):
   """Wrapper that calls _fit_model with arguments unwrapped."""
   # TODO(rbharath): Bundle these arguments up into a training_params dict.
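The _train_test_inputs stub added above has no body yet; the comment earlier in this commit spells out the intended decomposition: (a) compute the train/test split once from smiles alone, then (b) turn each shard into train/test numpy arrays. A self-contained sketch of that shape, under the assumption that shards arrive as (smiles_list, feature_array) pairs (the real code would load them from the sharded joblib files):

import numpy as np

def _train_test_inputs_sketch(shards, frac_train=0.8, seed=None):
  # (a) compute the train/test split once, using only smiles strings
  all_smiles = sorted(set(s for smiles, _ in shards for s in smiles))
  rng = np.random.RandomState(seed)
  rng.shuffle(all_smiles)
  cutoff = int(frac_train * len(all_smiles))
  train_set = set(all_smiles[:cutoff])
  # (b) for each shard, slice out train/test numpy arrays
  train_blocks, test_blocks = [], []
  for smiles, X in shards:
    mask = np.array([s in train_set for s in smiles])
    train_blocks.append(X[mask])
    test_blocks.append(X[~mask])
  return np.vstack(train_blocks), np.vstack(test_blocks)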
@@ -486,7 +505,7 @@ def get_model_extension(modeltype):
   if modeltype == "sklearn":
     return "joblib"
   elif modeltype == "autograd":
-    return "pkl.gz"
+    return "joblib.gz"
   elif modeltype == "keras-graph" or modeltype == "keras-sequential":
     return "h5"

+0 −1
@@ -34,7 +34,6 @@ def compute_model_performance(raw_test_data, test_data, task_types, models,
     print("Target %s" % target, file=print_file)
     (test_ids, X_test, y_test, w_test) = test_data[target]
     (_, _, ytest_raw, _) = raw_test_data[target]
-    #model = models[target]
     model = models.itervalues().next()
     results = eval_model(
         test_ids, X_test, y_test, ytest_raw, w_test, model,
+20 −14
@@ -14,11 +14,13 @@ from vs_utils.utils import SmilesGenerator, ScaffoldGenerator
 from vs_utils.features.fingerprints import CircularFingerprint
 from vs_utils.features.basic import SimpleDescriptors
 
-def generate_directories(name, out, feature_fields):
+def generate_directories(name, input_file, out, feature_fields):
   """Generate directory structure for featurized dataset."""
   dataset_dir = os.path.join(out, name)
   if not os.path.exists(dataset_dir):
     os.makedirs(dataset_dir)
 
+  '''
   fingerprint_dir = os.path.join(dataset_dir, "fingerprints")
   if not os.path.exists(fingerprint_dir):
     os.makedirs(fingerprint_dir)
@@ -34,12 +36,15 @@ def generate_directories(name, out, feature_fields):
       os.makedirs(feature_field_dir)
 
   # Return names of files to be generated
-  # TODO(rbharath): Explicitly passing around out_*_pkl is an encapsulation
+  # TODO(rbharath): Explicitly passing around out_*_joblib is an encapsulation
   # failure. Remove this.
-  out_y_pkl = os.path.join(target_dir, "%s.pkl.gz" % name)
-  out_x_pkl = (os.path.join(feature_field_dir, "%s-features.pkl.gz" %name)
+  '''
+  basename = os.path.basename(input_file)
+
+  out_y_joblib = os.path.join(dataset_dir, "%s_%s.joblib" % (name, basename))
+  out_x_joblib = (os.path.join(dataset_dir, "%s_%s-features.joblib" % (name, basename))
               if feature_fields is not None else None)
-  return out_x_pkl, out_y_pkl
+  return out_x_joblib, out_y_joblib
 
 def parse_float_input(val):
   """Safely parses a float input."""
@@ -54,11 +59,13 @@ def parse_float_input(val):
     if ">" in val or "<" in val or "-" in val:
       return np.nan
 
-def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field, featuretype):
+def generate_vs_utils_features(dataframe, name, input_file, out, smiles_field, id_field, featuretype):
   """Generates circular fingerprints for dataset."""
   dataset_dir = os.path.join(out, name)
   feature_dir = os.path.join(dataset_dir, featuretype)
-  features = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))
+  shard_index_0 = input_file.split("_")[1]
+  shard_index_1 = input_file.split("_")[2]
+  features = os.path.join(feature_dir, "%s-%s-%s-%s.joblib" % (name, featuretype, shard_index_0, shard_index_1))
 
   feature_df = pd.DataFrame([])
   feature_df["smiles"] = dataframe[[smiles_field]]
@@ -81,8 +88,7 @@ def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field, featuretype):
   feature_df["features"] = pd.DataFrame(
       [{"features": feature} for feature in featurizer.featurize(mols)])
 
-  with gzip.open(features, "wb") as gzip_file:
-    pickle.dump(feature_df, gzip_file, pickle.HIGHEST_PROTOCOL)
+  save_sharded_dataset(feature_df, features)
 
 def get_rows(input_file, input_type, delimiter):
   """Returns an iterator over all rows in input_file"""
@@ -165,7 +171,7 @@ def process_field(data, field_type):
     return data
 
 def generate_targets(dataframe, prediction_field, split_field,
-                     smiles_field, id_field, out_pkl):
+                     smiles_field, id_field, out_joblib):
   """Process input data file, generate labels, i.e. y"""
   #TODO(enf, rbharath): Modify package unique identifier to take user-specified
     #unique identifier instead of assuming smiles string
@@ -176,8 +182,8 @@ def generate_targets(dataframe, prediction_field, split_field,
   if split_field is not None:
     labels_df["split"] = dataframe[[split_field]]
 
-  # Write pkl.gz file
-  with gzip.open(out_pkl, "wb") as pickle_file:
+  # Write joblib file
+  with gzip.open(out_joblib, "wb") as pickle_file:
     pickle.dump(labels_df, pickle_file, pickle.HIGHEST_PROTOCOL)
 
 def generate_scaffold(smiles_elt, include_chirality=False, smiles_field="smiles"):
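These targets are still written with gzip + pickle even though the path now ends in .joblib and the loading code later in this commit reads targets back with joblib.load. Writing with joblib would keep the two sides symmetric; a minimal sketch (the compress level is an arbitrary choice):

import joblib

def write_targets(labels_df, out_joblib):
  # joblib handles compression itself, so no gzip wrapper is needed
  joblib.dump(labels_df, out_joblib, compress=3)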
@@ -188,7 +194,7 @@ def generate_scaffold(smiles_elt, include_chirality=False, smiles_field="smiles"):
   scaffold = engine.get_scaffold(mol)
   return scaffold
 
-def generate_features(dataframe, feature_fields, smiles_field, id_field, out_pkl):
+def generate_features(dataframe, feature_fields, smiles_field, id_field, out_joblib):
   """Puts user defined features into a standard directory structure."""
   if feature_fields is None:
     print("No feature field specified by user.")
@@ -210,7 +216,7 @@ def generate_features(dataframe, feature_fields, smiles_field, id_field, out_pkl):
     features_data.append({"row": np.array(feature_list)})
   features_df["features"] = pd.DataFrame(features_data)
 
-  with gzip.open(out_pkl, "wb") as pickle_file:
+  with gzip.open(out_joblib, "wb") as pickle_file:
     pickle.dump(features_df, pickle_file, pickle.HIGHEST_PROTOCOL)
 
 def extract_data(input_file, input_type, fields, field_types,
+38 −37
@@ -8,7 +8,7 @@ __license__ = "LGPL"
 import gzip
 import numpy as np
 import os
-import cPickle as pickle
+import joblib
 from deep_chem.utils.preprocess import transform_outputs
 from deep_chem.utils.preprocess import transform_inputs
 from deep_chem.utils.preprocess import dataset_to_numpy
@@ -19,7 +19,7 @@ from vs_utils.utils import ScaffoldGenerator
 
 def process_datasets(paths, input_transforms, output_transforms,
     feature_types=["fingerprints"], mode="multitask",
-    splittype="random", seed=None, weight_positives=True):
+    splittype="random", seed=None, weight_positives=True, shard_id=None):
   """Extracts datasets and split into train/test.
 
   Returns a dict that maps target names to tuples.
@@ -65,7 +65,7 @@ def process_datasets(paths, input_transforms, output_transforms,
   print np.shape(target[1])
   return train_dict, test_dict
 
-def load_molecules(paths, feature_types=["fingerprints"]):
+def load_molecules(paths, feature_types=["fingerprints"], shard_id=None):
   """Load dataset fingerprints and return fingerprints.
 
   Returns a dictionary that maps smiles strings to dicts that contain
@@ -79,13 +79,14 @@ def load_molecules(paths, feature_types=["fingerprints"]):
   molecules = {}
   for dataset_path in paths:
     for feature_type in feature_types:
-      pickle_dir = os.path.join(dataset_path, feature_type)
-      pickle_files = os.listdir(pickle_dir)
-      if len(pickle_files) == 0:
-        raise ValueError("No Pickle Files found to load molecules")
-      for pickle_file in pickle_files:
-        with gzip.open(os.path.join(pickle_dir, pickle_file), "rb") as f:
-          contents = pickle.load(f)
+      joblib_dir = os.path.join(dataset_path, feature_type)
+      joblib_files = os.listdir(joblib_dir)
+      if shard_id is not None:
+        joblib_files = [f for f in joblib_files if shard_id in f]
+      if len(joblib_files) == 0:
+        raise ValueError("No joblib files found to load molecules")
+      for joblib_file in joblib_files:
+        contents = load_sharded_dataset(os.path.join(joblib_dir, joblib_file))
         smiles, features, scaffolds, mol_ids = (
             contents["smiles"], contents["features"],
             contents["scaffolds"], contents["mol_id"])
@@ -116,9 +117,9 @@ def get_target_names(paths, target_dir_name="targets"):
   target_names = []
   for dataset_path in paths:
     target_dir = os.path.join(dataset_path, target_dir_name)
-    target_names += [target_pickle.split(".")[0]
-        for target_pickle in os.listdir(target_dir)
-        if "pkl.gz" in target_pickle]
+    target_names += [target_joblib.split(".")[0]
+        for target_joblib in os.listdir(target_dir)
+        if "pkl.gz" in target_joblib]
   return target_names
 
 def load_assays(paths, target_dir_name="targets"):
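get_target_names above (and load_assays below) still filter on "pkl.gz", while generate_directories in this commit now names target files *.joblib, so freshly featurized datasets would match nothing here. A filter that accepts both spellings during the migration (sketch only):

def is_target_file(fname):
  return fname.endswith(".pkl.gz") or fname.endswith(".joblib")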
@@ -138,12 +139,12 @@ def load_assays(paths, target_dir_name="targets"):
   target_names = get_target_names(paths, target_dir_name)
   for dataset_path in paths:
     target_dir = os.path.join(dataset_path, target_dir_name)
-    for target_pickle in os.listdir(target_dir):
-      if "pkl.gz" not in target_pickle:
+    for target_joblib in os.listdir(target_dir):
+      if "pkl.gz" not in target_joblib:
         continue
-      target_name = target_pickle.split(".")[0]
-      with gzip.open(os.path.join(target_dir, target_pickle), "rb") as f:
-        contents = pickle.load(f)
+      target_name = target_joblib.split(".")[0]
+      with gzip.open(os.path.join(target_dir, target_joblib), "rb") as f:
+        contents = joblib.load(f)
         if "prediction" not in contents:
           raise ValueError("Prediction Endpoint Missing.")
         for ind, id in enumerate(contents["mol_id"]):
@@ -166,7 +167,7 @@ def load_assays(paths, target_dir_name="targets"):
           labels[id][target_name] = measurement
   return labels, splits
 
-def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints"]):
+def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints"], shard_id=None):
   """Load both labels and fingerprints.
 
   Returns a dictionary that maps mol_id's to pairs of (fingerprint, labels)
@@ -178,7 +179,7 @@ def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints"]):
     Paths to base directory.
   """
   data = {}
-  molecules = load_molecules(paths, feature_types)
+  molecules = load_molecules(paths, feature_types, shard_id=shard_id)
   labels, splits = load_assays(paths, target_dir_name)
   for ind, id in enumerate(molecules):
     if id not in labels: