Commit 9ffb217f authored by Bharath Ramsundar

Fixes to multitask loading pipeline.

parent 63ada38a
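
Summary of the change: the single --prediction-field argument becomes a multi-valued --target-fields argument, and the resulting list of target names is threaded through featurization, train/test splitting, model fitting, and evaluation in place of the old targets-directory scan in get_target_names. A minimal argparse sketch of the new flag (field names are made up for illustration; the diff itself writes required=1, which argparse accepts as truthy):

import argparse

# Sketch of the interface change: one flag now collects a list of targets.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--target-fields", type=str, nargs="+", required=True,
    help="Name of measured field to predict.")

args = parser.parse_args(["--target-fields", "pIC50_a", "pIC50_b"])
print(args.target_fields)  # ['pIC50_a', 'pIC50_b']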
+29 −25
@@ -11,7 +11,7 @@ from deep_chem.utils.featurize import generate_targets
 from deep_chem.utils.featurize import generate_features
 from deep_chem.utils.featurize import generate_vs_utils_features
 from deep_chem.models.standard import fit_singletask_models
-from deep_chem.utils.load import get_target_names
+#from deep_chem.utils.load import get_target_names
 from deep_chem.utils.load import process_datasets
 from deep_chem.utils.load import transform_data
 from deep_chem.utils.evaluate import results_to_csv
@@ -51,7 +51,7 @@ def add_featurize_group(featurize_cmd):
       "--feature-fields", type=str, nargs="+",
       help="Optional field that holds pre-computed feature vector")
   featurize_group.add_argument(
-      "--prediction-field", type=str, required=1,
+      "--target-fields", type=str, nargs="+", required=1,
       help="Name of measured field to predict.")
   featurize_group.add_argument(
       "--split-field", type=str, default=None,
@@ -66,7 +66,7 @@ def add_featurize_group(featurize_cmd):
   # TODO(rbharath): This should be moved to train-tests-split
   featurize_group.add_argument(
       "--threshold", type=float, default=None,
-      help="If specified, will be used to binarize real-valued prediction-field.")
+      help="If specified, will be used to binarize real-valued target-fields.")
   featurize_group.add_argument(
       "--name", required=1,
       help="Name of the dataset.")
@@ -284,7 +284,7 @@ def create_model(args):
   if not args.skip_featurization:
     _featurize_input(
         args.name, args.out, args.input_file, args.input_type, args.fields,
-        args.field_types, args.feature_fields, args.prediction_field,
+        args.field_types, args.feature_fields, args.target_fields,
         args.smiles_field, args.split_field, args.id_field, args.threshold,
         args.delimiter)
 
@@ -296,7 +296,8 @@ def create_model(args):
   test_out = os.path.join(data_dir, "%s-test.pkl.gz" % args.name)
   _train_test_input(
       paths, args.output_transforms, args.input_transforms, args.feature_types,
-      args.splittype, weight_positives, args.mode, train_out, test_out)
+      args.splittype, weight_positives, args.mode, train_out, test_out,
+      args.target_fields)
 
   print "+++++++++++++++++++++++++++++++++"
   print "Fit model"
@@ -306,7 +307,7 @@ def create_model(args):
   _fit_model(
       paths, args.model, args.task_type, args.n_hidden, args.learning_rate,
       args.dropout, args.n_epochs, args.decay, args.batch_size,
-      args.validation_split, saved_out, train_out)
+      args.validation_split, saved_out, train_out, args.target_fields)
 
 
   print "+++++++++++++++++++++++++++++++++"
@@ -327,13 +328,13 @@ def create_model(args):
   _eval_trained_model(
       modeltype, saved_out, train_out, paths, args.task_type, compute_aucs,
       compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
-      compute_rms, csv_out_train, stats_out_train)
+      compute_rms, csv_out_train, stats_out_train, args.target_fields)
   print "Eval Model on Test"
   print "------------------"
   _eval_trained_model(
       modeltype, saved_out, test_out, paths, args.task_type, compute_aucs,
       compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
-      compute_rms, csv_out_test, stats_out_test)
+      compute_rms, csv_out_test, stats_out_test, args.target_fields)
 
 def parse_args(input_args=None):
   """Parse command-line arguments."""
@@ -353,12 +354,12 @@ def featurize_input(args):
   """Wrapper function that calls _featurize_input with args unwrapped."""
   _featurize_input(
       args.name, args.out, args.input_file, args.input_type, args.fields,
-      args.field_types, args.feature_fields, args.prediction_field,
+      args.field_types, args.feature_fields, args.target_fields,
       args.smiles_field, args.split_field, args.id_field, args.threshold,
       args.delimiter)
 
 def _featurize_input(name, out, input_file, input_type, fields, field_types,
-                     feature_fields, prediction_field, smiles_field,
+                     feature_fields, target_fields, smiles_field,
                      split_field, id_field, threshold, delimiter):
   """Featurizes raw input data."""
   if len(fields) != len(field_types):
@@ -367,10 +368,10 @@ def _featurize_input(name, out, input_file, input_type, fields, field_types,
     id_field = smiles_field
   out_x_pkl, out_y_pkl = generate_directories(name, out, feature_fields)
   df, mols = extract_data(
-      input_file, input_type, fields, field_types, prediction_field,
+      input_file, input_type, fields, field_types, target_fields,
       smiles_field, threshold, delimiter)
   print "Generating targets"
-  generate_targets(df, prediction_field, split_field,
+  generate_targets(df, target_fields, split_field,
                    smiles_field, id_field, out_y_pkl)
   print "Generating user-specified features"
   generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
@@ -384,23 +385,24 @@ def train_test_input(args):
   _train_test_input(
       args.paths, args.output_transforms, args.input_transforms,
       args.feature_types, args.splittype, args.weight_positives, args.mode,
-      args.train_out, args.test_out)
+      args.train_out, args.test_out, args.target_fields)
 
 def _train_test_input(paths, output_transforms, input_transforms,
                       feature_types, splittype, weight_positives, mode,
-                      train_out, test_out):
+                      train_out, test_out, target_names):
   """Saves transformed model."""
-  targets = get_target_names(paths)
+  #targets = get_target_names(paths)
   if output_transforms == "" or output_transforms == "None":
     output_transforms = []
   else:
     output_transforms = output_transforms.split(",")
-  output_transforms_dict = {target: output_transforms for target in targets}
+  output_transforms_dict = {target: output_transforms for target in target_names}
   feature_types = feature_types.split(",")
   train_dict, test_dict = process_datasets(
       paths, input_transforms, output_transforms_dict,
       feature_types=feature_types, splittype=splittype,
-      weight_positives=weight_positives, mode=mode)
+      weight_positives=weight_positives, mode=mode,
+      target_names=target_names)
   trans_train_dict = transform_data(
       train_dict, input_transforms, output_transforms)
   trans_test_dict = transform_data(test_dict, input_transforms, output_transforms)
@@ -423,14 +425,15 @@ def fit_model(args):
   _fit_model(
       args.paths, args.model, args.task_type, args.n_hidden,
       args.learning_rate, args.dropout, args.n_epochs, args.decay,
-      args.batch_size, args.validation_split, args.saved_out, args.saved_data)
+      args.batch_size, args.validation_split, args.saved_out, args.saved_data,
+      args.target_fields)
 
 def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
                n_epochs, decay, batch_size, validation_split, saved_out,
-               saved_data):
+               saved_data, target_names):
   """Builds model from featurized data."""
-  targets = get_target_names(paths)
-  task_types = {target: task_type for target in targets}
+  #targets = get_target_names(paths)
+  task_types = {target: task_type for target in target_names}
 
   with gzip.open(saved_data) as data_file:
     stored_train = pickle.load(data_file)
@@ -484,16 +487,17 @@ def eval_trained_model(args):
       args.modeltype, args.saved_model, args.saved_data, args.paths,
       args.task_type, args.compute_aucs, args.compute_recall,
       args.compute_accuracy, args.compute_matthews_corrcoef, args.compute_r2s,
-      args.compute_rms, args.csv_out, args.stats_out)
+      args.compute_rms, args.csv_out, args.stats_out,
+      args.target_fields)
 
 def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
                         compute_aucs, compute_recall, compute_accuracy,
                         compute_matthews_corrcoef, compute_r2s, compute_rms,
-                        csv_out, stats_out):
+                        csv_out, stats_out, target_names):
   """Evaluates a trained model on specified data."""
   model = load_model(modeltype, saved_model)
-  targets = get_target_names(paths)
-  task_types = {target: task_type for target in targets}
+  #targets = get_target_names(paths)
+  task_types = {target: task_type for target in target_names}
 
   with gzip.open(saved_data) as data_file:
     stored_test = pickle.load(data_file)
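
A note on the pattern above: wherever this script previously recovered target names by scanning the targets/ directory (get_target_names, now commented out), it instead expands the user-supplied target_fields list into per-target dicts. A reduced sketch with placeholder names:

# Placeholder target names and settings, mirroring the dict-comprehension
# pattern used in _train_test_input, _fit_model, and _eval_trained_model.
target_names = ["pIC50_a", "pIC50_b"]
task_type = "classification"
output_transforms = ["log"]

task_types = {target: task_type for target in target_names}
output_transforms_dict = {target: output_transforms for target in target_names}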
+35 −21
@@ -54,34 +54,45 @@ def parse_float_input(val):
     if ">" in val or "<" in val or "-" in val:
       return np.nan
 
-def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field, featuretype):
+def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field,
+    featuretype, log_every_n=1000):
   """Generates circular fingerprints for dataset."""
   dataset_dir = os.path.join(out, name)
   feature_dir = os.path.join(dataset_dir, featuretype)
-  features = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))
+  features_file = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))
 
-  feature_df = pd.DataFrame([])
-  feature_df["smiles"] = dataframe[[smiles_field]]
-  feature_df["scaffolds"] = dataframe[[smiles_field]].apply(
-      functools.partial(generate_scaffold, smiles_field=smiles_field),
-      axis=1)
-  feature_df["mol_id"] = dataframe[[id_field]]
-
-  mols = []
-  for row in dataframe.iterrows():
-    # pandas rows are tuples (row_num, row_data)
-    smiles = row[1][smiles_field]
-    mols.append(Chem.MolFromSmiles(smiles))
+  print "About to instantiate featurizer."
   if featuretype == "fingerprints":
     featurizer = CircularFingerprint(size=1024)
   elif featuretype == "descriptors":
     featurizer = SimpleDescriptors()
   else:
     raise ValueError("Unsupported featuretype requested.")
+  print "About to generate features for molecules"
+  features, mol = [], None
+  smiles = dataframe[smiles_field].tolist()
+  for row_ind, row_data in enumerate(smiles):
+    if row_ind % log_every_n == 0:
+      print "Featurizing molecule %d" % row_ind
+    mol = Chem.MolFromSmiles(row_data)
+    features.append(featurizer.featurize([mol]))
+  print "Done generating features. About to transfer them to dataframe."
+  feature_df = pd.DataFrame([])
   feature_df["features"] = pd.DataFrame(
-      [{"features": feature} for feature in featurizer.featurize(mols)])
+      [{"features": feature} for feature in features])
+  #feature_df["features"] = pd.DataFrame(
+  #    [{"features": feature} for feature in featurizer.featurize(mols)])
 
+  print "Done transfering to dataframe. About to populate remaining df fields."
+  feature_df["smiles"] = dataframe[[smiles_field]]
+  feature_df["scaffolds"] = dataframe[[smiles_field]].apply(
+      functools.partial(generate_scaffold, smiles_field=smiles_field),
+      axis=1)
+  feature_df["mol_id"] = dataframe[[id_field]]
+  print "Populated 'smiles', 'scaffolds', 'mol_id' fields"
 
-  with gzip.open(features, "wb") as gzip_file:
+  print "About to write pkl.gz file"
+  with gzip.open(features_file, "wb") as gzip_file:
     pickle.dump(feature_df, gzip_file, pickle.HIGHEST_PROTOCOL)
 
 def get_rows(input_file, input_type, delimiter):
@@ -164,7 +175,7 @@ def process_field(data, field_type):
   elif field_type == "ndarray":
     return data
 
-def generate_targets(dataframe, prediction_field, split_field,
+def generate_targets(dataframe, target_fields, split_field,
                      smiles_field, id_field, out_pkl):
   """Process input data file, generate labels, i.e. y"""
   #TODO(enf, rbharath): Modify package unique identifier to take user-specified
@@ -172,7 +183,8 @@ def generate_targets(dataframe, prediction_field, split_field,
   labels_df = pd.DataFrame([])
   labels_df["mol_id"] = dataframe[[id_field]]
   labels_df["smiles"] = dataframe[[smiles_field]]
-  labels_df["prediction"] = dataframe[[prediction_field]]
+  for target in target_fields:
+    labels_df[target] = dataframe[[target]]
   if split_field is not None:
     labels_df["split"] = dataframe[[split_field]]
 
@@ -214,11 +226,13 @@ def generate_features(dataframe, feature_fields, smiles_field, id_field, out_pkl
     pickle.dump(features_df, pickle_file, pickle.HIGHEST_PROTOCOL)
 
 def extract_data(input_file, input_type, fields, field_types,
-                 prediction_field, smiles_field, threshold, delimiter):
+                 target_fields, smiles_field, threshold, delimiter,
+                 log_every_n=1000):
   """Extracts data from input as Pandas data frame"""
   rows, mols, smiles = [], [], SmilesGenerator()
   colnames = []
   for row_index, raw_row in enumerate(get_rows(input_file, input_type, delimiter)):
+    if row_index % log_every_n == 0:
+      print row_index
     # Skip empty rows
     if raw_row is None:
@@ -231,7 +245,7 @@ def extract_data(input_file, input_type, fields, field_types,
       continue
     row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_field, colnames)
     for (field, field_type) in zip(fields, field_types):
-      if field == prediction_field and threshold is not None:
+      if field in target_fields and threshold is not None:
        raw_val = process_field(row_data[field], field_type)
        row[field] = 1 if raw_val > threshold else 0
      else:
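
The featurization rewrite above replaces the up-front mols list with a streaming loop that logs progress every log_every_n molecules. A self-contained sketch of that loop (the featurizer is passed in rather than constructed, to stay neutral about vs_utils import paths):

from rdkit import Chem

def featurize_smiles(smiles_list, featurizer, log_every_n=1000):
  """Parse and featurize one molecule at a time, logging progress."""
  features = []
  for row_ind, smiles in enumerate(smiles_list):
    if row_ind % log_every_n == 0:
      print("Featurizing molecule %d" % row_ind)
    mol = Chem.MolFromSmiles(smiles)
    features.append(featurizer.featurize([mol]))
  return features

# e.g. featurize_smiles(dataframe[smiles_field].tolist(),
#                       CircularFingerprint(size=1024))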
+38 −40
@@ -19,7 +19,7 @@ from vs_utils.utils import ScaffoldGenerator
 
 def process_datasets(paths, input_transforms, output_transforms,
     feature_types=["fingerprints"], mode="multitask",
-    splittype="random", seed=None, weight_positives=True):
+    splittype="random", seed=None, weight_positives=True, target_names=[]):
   """Extracts datasets and split into train/test.
 
   Returns a dict that maps target names to tuples.
@@ -36,7 +36,7 @@ def process_datasets(paths, input_transforms, output_transforms,
   seed: int
     Seed used for random splits.
   """
-  dataset = load_datasets(paths, feature_types=feature_types)
+  dataset = load_datasets(paths, feature_types=feature_types, target_names=target_names)
   train_dict, test_dict = {}, {}
   if mode == "singletask":
     singletask = multitask_to_singletask(dataset)
@@ -97,23 +97,23 @@ def load_molecules(paths, feature_types=["fingerprints"]):
               entry["feature_types"].append(feature_type)
   return molecules 
 
-def get_target_names(paths, target_dir_name="targets"):
-  """Get names of targets in provided collections.
-
-  Parameters
-  ----------
-  paths: list 
-    List of paths to base directory.
-  """
-  target_names = []
-  for dataset_path in paths:
-    target_dir = os.path.join(dataset_path, target_dir_name)
-    target_names += [target_pickle.split(".")[0]
-        for target_pickle in os.listdir(target_dir)
-        if "pkl.gz" in target_pickle]
-  return target_names
-
-def load_assays(paths, target_dir_name="targets"):
+#def get_target_names(paths, target_dir_name="targets"):
+#  """Get names of targets in provided collections.
+#
+#  Parameters
+#  ----------
+#  paths: list 
+#    List of paths to base directory.
+#  """
+#  target_names = []
+#  for dataset_path in paths:
+#    target_dir = os.path.join(dataset_path, target_dir_name)
+#    target_names += [target_pickle.split(".")[0]
+#        for target_pickle in os.listdir(target_dir)
+#        if "pkl.gz" in target_pickle]
+#  return target_names
+
+def load_assays(paths, target_dir_name, target_names):
   """Load regression dataset labels from assays.
 
   Returns a dictionary that maps mol_id's to label vectors.
@@ -127,38 +127,36 @@
   """
   labels, splits = {}, {}
-  # Compute target names
-  target_names = get_target_names(paths, target_dir_name)
   for dataset_path in paths:
     target_dir = os.path.join(dataset_path, target_dir_name)
     for target_pickle in os.listdir(target_dir):
       if "pkl.gz" not in target_pickle:
         continue
       target_name = target_pickle.split(".")[0]
       with gzip.open(os.path.join(target_dir, target_pickle), "rb") as f:
         contents = pickle.load(f)
-        if "prediction" not in contents:
-          raise ValueError("Prediction Endpoint Missing.")
-        for ind, id in enumerate(contents["mol_id"]):
-          measurement = contents["prediction"][ind]
-          if "split" is not None:
-            splits[id] = contents["split"][ind]
+        for ind, mol_id in enumerate(contents["mol_id"]):
+          if "split" in contents:
+            splits[mol_id] = contents["split"][ind]
           else:
-            splits[id] = None
+            splits[mol_id] = None
+          if mol_id not in labels:
+            labels[mol_id] = {}
+            # Ensure that each target has some entry in dict.
+            for target_name in target_names:
+              # Set all targets to invalid for now.
+              labels[mol_id][target_name] = -1
+          for target_name in target_names:
+            measurement = contents[target_name][ind]
            try:
              if measurement is None or np.isnan(measurement):
                continue
            except TypeError:
              continue
-          if id not in labels:
-            labels[id] = {}
-            # Ensure that each target has some entry in dict.
-            for name in target_names:
-              # Set all targets to invalid for now.
-              labels[id][name] = -1
-          labels[id][target_name] = measurement 
+            labels[mol_id][target_name] = measurement 
   return labels, splits
 
-def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints"]):
+def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints"],
+                  target_names=[]):
   """Load both labels and fingerprints.
 
   Returns a dictionary that maps mol_id's to pairs of (fingerprint, labels)
@@ -171,7 +169,7 @@ def load_datasets(paths, target_dir_name="targets", feature_types=["fingerprints
   """
   data = {}
   molecules = load_molecules(paths, feature_types)
-  labels, splits = load_assays(paths, target_dir_name)
+  labels, splits = load_assays(paths, target_dir_name, target_names)
   for ind, id in enumerate(molecules):
     if id not in labels:
       continue
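
The reworked load_assays above seeds every molecule's label dict with a -1 "invalid" sentinel for each requested target, then overwrites entries only when a usable (non-None, non-NaN) measurement exists. A condensed, self-contained version of that bookkeeping, where contents stands in for one unpickled targets frame:

import numpy as np

def fill_labels(contents, target_names):
  """Map mol_id -> {target: measurement}, with -1 marking missing labels."""
  labels = {}
  for ind, mol_id in enumerate(contents["mol_id"]):
    if mol_id not in labels:
      # Every target gets an entry; -1 marks "invalid", as in the diff.
      labels[mol_id] = {target: -1 for target in target_names}
    for target_name in target_names:
      measurement = contents[target_name][ind]
      try:
        if measurement is None or np.isnan(measurement):
          continue
      except TypeError:
        continue
      labels[mol_id][target_name] = measurement
  return labels

# fill_labels({"mol_id": ["m1"], "t1": [0.5], "t2": [np.nan]}, ["t1", "t2"])
# -> {"m1": {"t1": 0.5, "t2": -1}}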
+4 −4
@@ -144,8 +144,8 @@ def balance_positives(y, W):
     if to_next_target:
       continue
     n_positives, n_negatives = len(positive_inds), len(negative_inds)
-    print "For target %d, n_positives: %d, n_negatives: %d" % (
-        target_ind, n_positives, n_negatives)
+    #print "For target %d, n_positives: %d, n_negatives: %d" % (
+    #    target_ind, n_positives, n_negatives)
     # TODO(rbharath): This results since the coarse train/test split doesn't
     # guarantee that the test set actually has any positives for targets. FIX
     # THIS BEFORE RELEASE!
@@ -157,13 +157,13 @@ def balance_positives(y, W):
     W[negative_inds, target_ind] = 1
   return W
 
-def dataset_to_numpy(dataset, weight_positives=True):
+def dataset_to_numpy(dataset, weight_positives=False):
   """Transforms a set of tensor data into numpy arrays (X, y)"""
   n_samples = len(dataset.keys())
   sample_datapoint = dataset.itervalues().next()
   feature_shape = np.shape(sample_datapoint["fingerprint"])
   n_targets = len(sample_datapoint["labels"])
-  X = np.squeeze(np.zeros((n_samples,) + feature_shape + (n_targets,)))
+  X = np.squeeze(np.zeros((n_samples,) + feature_shape))
   y = np.zeros((n_samples, n_targets))
   W = np.ones((n_samples, n_targets))
   sorted_ids = sorted(dataset.keys())
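
One way to read the dataset_to_numpy change: features are shared across tasks, so X should not carry a per-target axis; only the labels y and weights W are per-target. A quick shape check with made-up sizes:

import numpy as np

n_samples, n_targets = 5, 3
feature_shape = (1024,)  # e.g. a 1024-bit circular fingerprint

X_old = np.squeeze(np.zeros((n_samples,) + feature_shape + (n_targets,)))
X_new = np.squeeze(np.zeros((n_samples,) + feature_shape))
print(X_old.shape)  # (5, 1024, 3): features duplicated per target
print(X_new.shape)  # (5, 1024): shared features; y and W stay (5, 3)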