Commit b77c93de authored by Bharath Ramsundar
Browse files

Removed defunct add_descriptor support.

parent b1011fe3
Loading
Loading
Loading
Loading
+35 −85
Original line number Diff line number Diff line
@@ -21,8 +21,8 @@ from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.load import load_and_transform_dataset

def process_multitask(paths, task_transforms, desc_transforms, splittype="random",
    seed=None, add_descriptors=False, weight_positives=False, desc_weight=0.5):
def process_multitask(paths, task_transforms, splittype="random",
    seed=None, weight_positives=False):
  """Extracts multitask datasets and splits into train/test.

  Returns a tuple (train, X_train, y_train, W_train, test, X_test, y_test,
  W_test): the train/test datasets with their feature, label, and weight arrays.
@@ -36,17 +36,14 @@ def process_multitask(paths, task_transforms, desc_transforms, splittype="random
    List of paths to Google vs datasets. 
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
    None, "log", "normalize" or "log-normalize". Only for regression outputs.
  splittype: string
    Must be "random" or "scaffold"
  seed: int
    Seed used for random splits.
  """
  dataset = load_and_transform_dataset(paths, task_transforms, desc_transforms,
      add_descriptors=add_descriptors, weight_positives=weight_positives)
  dataset = load_and_transform_dataset(paths, task_transforms,
      weight_positives=weight_positives)
  sorted_targets = sorted(dataset.keys())
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
@@ -54,20 +51,18 @@ def process_multitask(paths, task_transforms, desc_transforms, splittype="random
    train, test = train_test_scaffold_split(dataset)
  else:
    raise ValueError("Improper splittype. Must be random/scaffold.")
  X_train, y_train, W_train = dataset_to_numpy(train,
      add_descriptors=add_descriptors, desc_weight=desc_weight)
  X_train, y_train, W_train = dataset_to_numpy(train)
  if weight_positives:
    print "Train set balance"
    ensure_balanced(y_train, W_train)
  X_test, y_test, W_test = dataset_to_numpy(test,
      add_descriptors=add_descriptors, desc_weight=desc_weight)
  X_test, y_test, W_test = dataset_to_numpy(test)
  if weight_positives:
    print "Test set balance"
    ensure_balanced(y_test, W_test)
  return (train, X_train, y_train, W_train, test, X_test, y_test, W_test)

def process_singletask(paths, task_transforms, desc_transforms, splittype="random", seed=None,
    add_descriptors=False, desc_weight=0.5, weight_positives=True):
def process_singletask(paths, task_transforms, splittype="random", seed=None,
    weight_positives=True):
  """Extracts singletask datasets and splits into train/test.

  Returns a dict that maps target names to tuples.
@@ -84,16 +79,12 @@ def process_singletask(paths, task_transforms, desc_transforms, splittype="rando
  seed: int
    Seed used for random splits.
  """
  dataset = load_and_transform_dataset(paths, task_transforms, desc_transforms,
      add_descriptors=add_descriptors, weight_positives=weight_positives)
  dataset = load_and_transform_dataset(paths, task_transforms,
      weight_positives=weight_positives)
  singletask = multitask_to_singletask(dataset)
  arrays = {}
  for target in singletask:
    print target
    data = singletask[target]
    print "len(data)"
    print len(data)
    # TODO(rbharath): Remove limitation after debugging.
    if len(data) == 0:
      continue
    if splittype == "random":
@@ -102,18 +93,15 @@ def process_singletask(paths, task_transforms, desc_transforms, splittype="rando
      train, test = train_test_scaffold_split(data)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    X_train, y_train, W_train = dataset_to_numpy(train,
        add_descriptors=add_descriptors, desc_weight=desc_weight)
    X_test, y_test, W_test = dataset_to_numpy(test,
        add_descriptors=add_descriptors, desc_weight=desc_weight)
    X_train, y_train, W_train = dataset_to_numpy(train)
    X_test, y_test, W_test = dataset_to_numpy(test)
    arrays[target] = (train, X_train, y_train, W_train, test, X_test, y_test,
        W_test)
  return arrays


def fit_multitask_mlp(paths, task_types, task_transforms, desc_transforms,
                      splittype="random", add_descriptors=False, desc_weight=0.5,
                      weight_positives=False, **training_params):
def fit_multitask_mlp(paths, task_types, task_transforms,
                      splittype="random", weight_positives=False, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns AUCs, R^2 scores, and RMS values.
@@ -127,30 +115,18 @@ def fit_multitask_mlp(paths, task_types, task_transforms, desc_transforms,
    "classification" or "regression".
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  add_descriptors: bool
    Add descriptor prediction as extra task.
    None, "log", "normalize", or "log-normalize". Only for regression outputs.
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  (train, X_train, y_train, W_train, test, X_test, y_test, W_test) = (
      process_multitask(paths, task_transforms, desc_transforms,
      splittype=splittype, add_descriptors=add_descriptors, desc_weight=desc_weight,
      process_multitask(paths, task_transforms, splittype=splittype,
      weight_positives=weight_positives))
  print np.shape(y_train)
  model = train_multitask_model(X_train, y_train, W_train, task_types,
                                desc_transforms, add_descriptors=add_descriptors,
                                **training_params)
  results = eval_model(test, model, task_types, desc_transforms,
      add_descriptors=add_descriptors, modeltype="keras_multitask")
  if add_descriptors:
    local_task_types = task_types.copy()
    for desc in desc_transforms:
      local_task_types[desc] = "regression"
  else:
  results = eval_model(test, model, task_types,
      modeltype="keras_multitask")
  local_task_types = task_types.copy()
  aucs = compute_roc_auc_scores(results, local_task_types)
  if aucs:
@@ -160,9 +136,8 @@ def fit_multitask_mlp(paths, task_types, task_transforms, desc_transforms,
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))

def fit_singletask_mlp(paths, task_types, task_transforms,
                       desc_transforms, splittype="random",
                       add_descriptors=False, desc_weight=0.5,
                       weight_positives=True, num_to_train=None, **training_params):
                       splittype="random", weight_positives=True,
                       num_to_train=None, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras MLP.

@@ -174,15 +149,11 @@ def fit_singletask_mlp(paths, task_types, task_transforms,
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  singletasks = process_singletask(paths, task_transforms, desc_transforms,
    splittype=splittype, add_descriptors=add_descriptors,
    desc_weight=desc_weight, weight_positives=weight_positives)
  singletasks = process_singletask(paths, task_transforms,
    splittype=splittype, weight_positives=weight_positives)
  ret_vals = {}
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(singletasks.keys())
@@ -190,18 +161,15 @@ def fit_singletask_mlp(paths, task_types, task_transforms,
    sorted_targets = sorted_targets[:num_to_train]
  for index, target in enumerate(sorted_targets):
    print "Training model %d" % index
    print "Target %s" % target
    (train, X_train, y_train, W_train, test, X_test, y_test, W_test) = (
        singletasks[target])
    model = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, desc_transforms, add_descriptors=add_descriptors,
        **training_params)
        {target: task_types[target]}, **training_params)
    results = eval_model(test, model, {target: task_types[target]}, 
                         desc_transforms,
                         # We run singletask models as special cases of
                         # multitask.
                         modeltype="keras_multitask",
                         add_descriptors=add_descriptors)
    print "Target %s" % target
                         modeltype="keras_multitask")
    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)
@@ -219,18 +187,14 @@ def fit_singletask_mlp(paths, task_types, task_transforms,
    print rms
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))

def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=False,
                      learning_rate=0.01, decay=1e-6,
                      momentum=0.9, nesterov=True, activation="relu",
                      dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                      n_input=1024, validation_split=0.1):
def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
  dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500, n_input=1024,
  validation_split=0.1):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns a trained model.

  TODO(rbharath): The handling of add_descriptors for semi-supervised learning
  is horrible. Refactor.

  Parameters
  ----------
  X: np.ndarray
@@ -242,11 +206,6 @@ def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  add_descriptors: bool
    Add descriptor prediction as extra task.
  learning_rate: float
    Learning rate used.
  decay: float
@@ -261,16 +220,8 @@ def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=
  eps = .001
  num_tasks = len(task_types)
  sorted_targets = sorted(task_types.keys())
  if add_descriptors:
    sorted_descriptors = sorted(desc_transforms.keys())
    endpoints = sorted_targets + sorted_descriptors
    local_task_types = task_types.copy()
    for desc in desc_transforms:
      local_task_types[desc] = "regression"
  else:
  local_task_types = task_types.copy()
  endpoints = sorted_targets
  print "endpoints: " + str(endpoints)
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
  model = Graph()
@@ -308,7 +259,6 @@ def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=
  model.compile(optimizer=sgd, loss=loss_dict)
  print "Done compiling. About to fit model!"
  print "validation_split: " + str(validation_split)
  print "decay: " + str(decay)
  model.fit(data_dict, nb_epoch=nb_epoch, batch_size=batch_size, validation_split=validation_split,
            sample_weight=sample_weights)
  model.fit(data_dict, nb_epoch=nb_epoch, batch_size=batch_size,
    validation_split=validation_split, sample_weight=sample_weights)
  return model
+3 −9
Original line number Diff line number Diff line
@@ -24,8 +24,7 @@ from sklearn.linear_model import LassoLarsCV
from sklearn.svm import SVR

def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    add_descriptors=False, desc_transforms={}, splittype="random",
    seed=None, num_to_train=None):
    splittype="random", seed=None, num_to_train=None):
  """Fits singletask linear regression models to potency.

  Parameters
@@ -44,12 +43,8 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  """
  dataset = load_and_transform_dataset(paths, task_transforms, desc_transforms,
      add_descriptors=add_descriptors)
  dataset = load_and_transform_dataset(paths, task_transforms)
  singletask = multitask_to_singletask(dataset)
  aucs, r2s, rms = {}, {}, {}
  sorted_targets = sorted(singletask.keys())
@@ -87,9 +82,8 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(X_train, y_train.ravel())
    # TODO(rbharath): This breaks on regression datasets
    results = eval_model(test, model, {target: task_types[target]},
        desc_transforms, modeltype="sklearn", add_descriptors=add_descriptors)
        modeltype="sklearn")

    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
+8 −10
Original line number Diff line number Diff line
@@ -7,7 +7,6 @@ from deep_chem.models.deep import fit_singletask_mlp
from deep_chem.models.deep import fit_multitask_mlp
from deep_chem.models.standard import fit_singletask_models
from deep_chem.utils.load import get_default_task_types_and_transforms
from deep_chem.utils.preprocess import get_default_descriptor_transforms

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -52,22 +51,21 @@ def main():
    paths[dataset] = path

  task_types, task_transforms = get_default_task_types_and_transforms(paths)
  desc_transforms = get_default_descriptor_transforms()

  if args.model == "singletask_deep_network":
    fit_singletask_mlp(paths.values(), task_types, task_transforms,
      desc_transforms, splittype=args.splittype, add_descriptors=False,
      n_hidden=args.n_hidden, learning_rate=args.learning_rate,
      dropout=args.dropout, nb_epoch=args.n_epochs, decay=args.decay,
      batch_size=args.batch_size,
      splittype=args.splittype, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      nb_epoch=args.n_epochs, decay=args.decay, batch_size=args.batch_size,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives, num_to_train=args.num_to_train)
  elif args.model == "multitask_deep_network":
    fit_multitask_mlp(paths.values(), task_types, task_transforms,
      desc_transforms, splittype=args.splittype, add_descriptors=False,
      n_hidden=args.n_hidden, learning_rate = args.learning_rate, dropout = args.dropout,
      batch_size=args.batch_size, nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split, weight_positives=args.weight_positives)
      splittype=args.splittype, n_hidden=args.n_hidden, learning_rate =
      args.learning_rate, dropout = args.dropout, batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives)
  else:
    fit_singletask_models(paths.values(), args.model, task_types,
        task_transforms, splittype=args.splittype, num_to_train=args.num_to_train)
+9 −33
Original line number Diff line number Diff line
@@ -12,14 +12,14 @@ from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score

def model_predictions(test_set, model, n_targets, task_types, n_descriptors=0,
    add_descriptors=False, modeltype="sklearn"):
def model_predictions(test_set, model, n_targets, task_types,
    modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns a list of per-task predictions.

  TODO(rbharath): This function uses n_descriptors, n_targets instead of
  task_transforms, desc_transforms like everything else.
  TODO(rbharath): This function uses n_targets instead of
  task_transforms like everything else.

  Parameters
  ----------
@@ -32,23 +32,15 @@ def model_predictions(test_set, model, n_targets, task_types, n_descriptors=0,
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  n_descriptors: int
    Number of output descriptors
  modeltype: string
    Either sklearn, keras, or keras_multitask
  add_descriptors: bool
    Add descriptor prediction as extra task.
  """
  # Extract features for test set and make preds
  X, _, _ = dataset_to_numpy(test_set)
  if add_descriptors:
    n_outputs = n_targets + n_descriptors
  else:
    n_outputs = n_targets
  if modeltype == "keras_multitask":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_outputs):
    for index in range(n_targets):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
@@ -66,16 +58,13 @@ def model_predictions(test_set, model, n_targets, task_types, n_descriptors=0,
    ypreds = [ypreds]
  return ypreds
  
def eval_model(test_set, model, task_types, desc_transforms={}, modeltype="sklearn",
    add_descriptors=False):
def eval_model(test_set, model, task_types, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.

  Returns a dict which maps target-names to pairs of np.ndarrays (ytrue,
  yscore) of true labels vs. predicted scores.

  TODO(rbharath): This function is too complex. Refactor and simplify.
  TODO(rbharath): The handling of add_descriptors for semi-supervised learning
  is horrible. Refactor.

  Parameters
  ----------
@@ -86,27 +75,14 @@ def eval_model(test_set, model, task_types, desc_transforms={}, modeltype="sklea
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  modeltype: string
    Either sklearn, keras, or keras_multitask
  add_descriptors: bool
    Add descriptor prediction as extra task.
  """
  sorted_targets = sorted(task_types.keys())
  if add_descriptors:
    sorted_descriptors = sorted(desc_transforms.keys())
    endpoints = sorted_targets + sorted_descriptors
    local_task_types = task_types.copy()
    for desc in desc_transforms:
      local_task_types[desc] = "regression"
  else:
  local_task_types = task_types.copy()
  endpoints = sorted_targets
  ypreds = model_predictions(test_set, model, len(sorted_targets),
      local_task_types, n_descriptors=len(desc_transforms),
      modeltype=modeltype, add_descriptors=add_descriptors)
      local_task_types, modeltype=modeltype)
  results = {}
  for target in endpoints:
    results[target] = ([], [])  # (ytrue, yscore)
+12 −33
Original line number Diff line number Diff line
@@ -197,9 +197,7 @@ def load_pdbbind_datasets(pdbbind_paths):
  return df

def load_vs_datasets(paths, target_dir_name="targets",
    fingerprint_dir_name="circular-scaffold-smiles",
    descriptor_dir_name="descriptors",
    add_descriptors=False):
    fingerprint_dir_name="circular-scaffold-smiles"):
  """Load both labels and fingerprints.

  Returns a dictionary that maps smiles to pairs of (fingerprint, labels)
@@ -213,20 +211,12 @@ def load_vs_datasets(paths, target_dir_name="targets",
  data = {}
  molecules = load_molecules(paths, fingerprint_dir_name)
  labels = load_assays(paths, target_dir_name)
  if add_descriptors:
    descriptors = load_descriptors(paths, descriptor_dir_name)
  # TODO(rbharath): Why are there fewer descriptors than labels at times?
  # What accounts for the discrepancy? Please investigate.
  for ind, smiles in enumerate(molecules):
    if smiles not in labels or (add_descriptors and smiles not in descriptors):
    if smiles not in labels:
      continue
    mol = molecules[smiles]
    if add_descriptors:
      data[smiles] = {"fingerprint": mol["fingerprint"],
                      "scaffold": mol["scaffold"],
                      "labels": labels[smiles],
                      "descriptors": descriptors[smiles]}
    else:
    data[smiles] = {"fingerprint": mol["fingerprint"],
                    "scaffold": mol["scaffold"],
                    "labels": labels[smiles]}
@@ -243,11 +233,9 @@ def ensure_balanced(y, W):
      elif y[sample_ind, target_ind] == 1:
        pos_weight += W[sample_ind, target_ind]
    assert np.isclose(pos_weight, neg_weight)
  print "WEIGHTS ARE BALANCED"

def load_and_transform_dataset(paths, task_transforms, desc_transforms={},
    labels_endpoint="labels", descriptors_endpoint="descriptors",
    add_descriptors=False, weight_positives=True):
def load_and_transform_dataset(paths, task_transforms,
    labels_endpoint="labels", weight_positives=True):
  """Transform data labels as specified

  Parameters
@@ -255,19 +243,13 @@ def load_and_transform_dataset(paths, task_transforms, desc_transforms={},
  paths: list 
    List of paths to Google vs datasets. 
  task_transforms: dict 
    dict mapping target names to list of label transforms. Each list
    element must be "max-val", "log", "normalize". The transformations are
    performed in the order specified. An empty list
    corresponds to no transformations. Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  add_descriptors: bool
    Add descriptor prediction as extra task.
    dict mapping target names to list of label transforms. Each list element
    must be None, "log", "normalize", or "log-normalize". The transformations
    are performed in the order specified. An empty list corresponds to no
    transformations. Only for regression outputs.
  """
  dataset = load_datasets(paths, add_descriptors=add_descriptors)
  dataset = load_datasets(paths)
  X, y, W = transform_outputs(dataset, task_transforms,
      desc_transforms=desc_transforms, add_descriptors=add_descriptors,
      weight_positives=weight_positives)
  # TODO(rbharath): Take this out once test passes
  if weight_positives:
@@ -284,8 +266,5 @@ def load_and_transform_dataset(paths, task_transforms, desc_transforms={},
      else:
        labels[target] = y[s_index][t_index]
    datapoint[labels_endpoint] = labels
    if add_descriptors:
      # All non-target endpoints are descriptors
      datapoint[descriptors_endpoint] = y[s_index][len(sorted_targets):]
    trans_data[smiles] = datapoint 
  return trans_data
Loading