Commit 93ece013 authored by Bharath Ramsundar

Large batch of changes for descriptors. Changes broke vanilla deep-model training.

parent 8e09e7a8
+24 −22
@@ -21,8 +21,8 @@ from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.load import load_and_transform_dataset

-def process_multitask(paths, task_transforms, splittype="random",
-    seed=None, weight_positives=False):
+def process_multitask(paths, input_transforms, output_transforms, feature_types,
+    splittype="random", seed=None, weight_positives=False):
  """Extracts multitask datasets and splits into train/test.

  Returns a tuple of test/train datasets, fingerprints, and labels.
@@ -34,7 +34,7 @@ def process_multitask(paths, task_transforms, splittype="random",
  ----------
  paths: list 
    List of paths to datasets. 
-  task_transforms: dict
+  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    None, "log", "normalize" or "log-normalize". Only for regression outputs.
  splittype: string
@@ -42,8 +42,8 @@ def process_multitask(paths, task_transforms, splittype="random",
  seed: int
    Seed used for random splits.
  """
-  dataset = load_and_transform_dataset(paths, task_transforms,
-			prediction_endpoint,
+  dataset = load_and_transform_dataset(paths, input_transforms, output_transforms,
+			prediction_endpoint, feature_types=feature_types,
      weight_positives=weight_positives)
  sorted_targets = sorted(dataset.keys())
  if splittype == "random":
@@ -64,8 +64,8 @@ def process_multitask(paths, task_transforms, splittype="random",
  #  ensure_balanced(y_test, W_test)
  return (train, X_train, y_train, W_train, test, X_test, y_test, W_test)

-def process_singletask(paths, task_transforms,
-		prediction_endpoint,
+def process_singletask(paths, input_transforms, output_transforms,
+		prediction_endpoint, feature_types,
		splittype="random", seed=None,
    weight_positives=True):
  """Extracts singletask datasets and splits into train/test.
@@ -76,7 +76,7 @@ def process_singletask(paths, task_transforms,
  ----------
  paths: list 
    List of paths to Google vs datasets. 
-  task_transforms: dict
+  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  splittype: string
@@ -84,8 +84,8 @@ def process_singletask(paths, task_transforms,
  seed: int
    Seed used for random splits.
  """
-  dataset = load_and_transform_dataset(paths, task_transforms,
-			prediction_endpoint,
+  dataset = load_and_transform_dataset(paths, input_transforms, output_transforms,
+			prediction_endpoint, feature_types=feature_types,
      weight_positives=weight_positives)
  singletask = multitask_to_singletask(dataset)
  arrays = {}
@@ -105,8 +105,9 @@ def process_singletask(paths, task_transforms,
  return arrays


-def fit_multitask_mlp(paths, task_types, task_transforms, prediction_endpoint,
-                      splittype="random", weight_positives=False, **training_params):
+def fit_multitask_mlp(paths, task_types, input_transforms, output_transforms,
+                      prediction_endpoint, feature_types, splittype="random",
+                      weight_positives=False, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns AUCs, R^2 scores, and RMS values.
@@ -118,14 +119,14 @@ def fit_multitask_mlp(paths, task_types, task_transforms, prediction_endpoint,
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
-  task_transforms: dict
+  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    None, "log", "normalize", or "log-normalize". Only for regression outputs.
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
-      process_multitask(paths, task_transforms, splittype=splittype,
+      process_multitask(paths, input_transforms, output_transforms, feature_types, splittype=splittype,
                        weight_positives=weight_positives))
  print np.shape(y_train)
  model = train_multitask_model(X_train, y_train, W_train, task_types,
@@ -140,8 +141,9 @@ def fit_multitask_mlp(paths, task_types, task_transforms, prediction_endpoint,
  if r2s:
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))

-def fit_singletask_mlp(paths, task_types, task_transforms,
+def fit_singletask_mlp(paths, task_types, input_transforms, output_transforms,
											 prediction_endpoint,
+                       feature_types,
                       splittype="random", weight_positives=True,
                       num_to_train=None, **training_params):
  """
@@ -152,14 +154,14 @@ def fit_singletask_mlp(paths, task_types, task_transforms,
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
-  task_transforms: dict
+  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
-  singletasks = process_singletask(paths, task_transforms,
-		prediction_endpoint,
+  singletasks = process_singletask(paths, input_transforms, output_transforms,
+		prediction_endpoint, feature_types,
    splittype=splittype, weight_positives=weight_positives)
  ret_vals = {}
  aucs, r2s, rms = {}, {}, {}
@@ -196,7 +198,7 @@ def fit_singletask_mlp(paths, task_types, task_transforms,

def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
-  dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500, n_input=1024,
+  dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500, n_inputs=1024,
  validation_split=0.1):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
@@ -232,9 +234,9 @@ def train_multitask_model(X, y, W, task_types,
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
  model = Graph()
  model.add_input(name="input", ndim=n_input)
  model.add_input(name="input", ndim=n_inputs)
  model.add_node(
-      Dense(n_input, n_hidden, init='uniform', activation=activation),
+      Dense(n_inputs, n_hidden, init='uniform', activation=activation),
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
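A minimal sketch of how a caller invokes the refactored multitask entry point after this change: the old task_transforms argument is split into input_transforms (a list of feature transforms) and output_transforms (a per-target dict), and feature_types is now explicit. The shard path and target name below are hypothetical placeholders, not values from this commit.

    from deep_chem.models.deep import fit_multitask_mlp

    paths = ["/tmp/pcba/pcba-0.pkl.gz"]  # hypothetical sharded pickle input
    targets = ["target-0"]               # hypothetical target name
    task_types = {target: "classification" for target in targets}
    input_transforms = ["normalize"]     # transforms applied to input features
    output_transforms = {target: [] for target in targets}  # label transforms per target
    fit_multitask_mlp(paths, task_types, input_transforms, output_transforms,
                      prediction_endpoint="IC50", feature_types=["fingerprints"],
                      splittype="random", n_inputs=1024, n_hidden=500,
                      nb_epoch=20, batch_size=50)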
+9 −7
@@ -17,7 +17,8 @@ from deep_chem.utils.evaluate import compute_r2_scores
# code in deep.py
# TODO(rbharath): paths is to handle sharded input pickle files. Might be
# better to use hdf5 datasets like in MSMBuilder
-def process_3D_convolutions(paths, task_transforms, prediction_endpoint, seed=None, splittype="random"):
+def process_3D_convolutions(paths, input_transforms, output_transforms, prediction_endpoint,
+                            feature_types, seed=None, splittype="random"):
  """Loads 3D Convolution datasets.

  Parameters
@@ -25,8 +26,8 @@ def process_3D_convolutions(paths, task_transforms, prediction_endpoint, seed=No
  paths: list
    List of paths to convolution datasets.
  """
-  dataset = load_and_transform_dataset(paths, task_transforms,
-    prediction_endpoint, datatype="pdbbind")
+  dataset = load_and_transform_dataset(paths, input_transforms, output_transforms,
+    prediction_endpoint, feature_types=feature_types, datatype="pdbbind")
  # TODO(rbharath): Factor this code splitting out into a util function.
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
@@ -36,13 +37,14 @@ def process_3D_convolutions(paths, task_transforms, prediction_endpoint, seed=No
  X_test, y_test, W_test = tensor_dataset_to_numpy(test)
  return (X_train, y_train, W_train, train), (X_test, y_test, W_test, test)

-def fit_3D_convolution(paths, task_types, task_transforms, prediction_endpoint,
-    axis_length=32, **training_params):
+def fit_3D_convolution(paths, task_types, input_transforms, output_transforms, prediction_endpoint,
+    feature_types, axis_length=32, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
-  (X_train, y_train, W_train, train), (X_test, y_test, W_test, test) = process_3D_convolutions(
-    paths, task_transforms, prediction_endpoint)
+  (X_train, y_train, W_train, train), (X_test, y_test, W_test, test) = (
+      process_3D_convolutions(paths, input_transforms, output_transforms, prediction_endpoint,
+                              feature_types))

  print "np.shape(X_train): " + str(np.shape(X_train))
  print "np.shape(y_train): " + str(np.shape(y_train))
+50 −14
@@ -7,17 +7,33 @@ from deep_chem.models.deep import fit_singletask_mlp
from deep_chem.models.deep import fit_multitask_mlp
from deep_chem.models.deep3d import fit_3D_convolution
from deep_chem.models.standard import fit_singletask_models
-from deep_chem.utils.load import get_default_task_types_and_transforms
+from deep_chem.utils.load import get_target_names

+# TODO(rbharath): Factor this into subcommands. The interface is too
+# complicated now to effectively use.
def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
-  parser.add_argument('--datasets', nargs="+", required=1,
-                      choices=['muv', 'pcba', 'dude', 'pfizer', 'globavir', 'pdbbind'],
-                      help='Name of dataset to process.')
+  #parser.add_argument("--datasets", nargs="+", required=1,
+  #                    choices=["muv", "pcba", "dude", "pfizer", "globavir", "pdbbind"],
+  #                    help="Name of dataset to process.")
+  #parser.add_argument("--dataset-names", nargs="+", required=1,
+  #                    help="Names of datasets to process.")
  parser.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
  parser.add_argument("--input-transforms", nargs="+", default=[],
                      choices=["normalize", "truncate-outliers"],
                      help="Transforms to apply to input data.")
  parser.add_argument("--output-transforms", nargs="+", default=[],
                      choices=["log", "normalize"],
                      help="Transforms to apply to output data.")
  parser.add_argument("--feature-types", nargs="+", required=1,
                      choices=["fingerprints", "descriptors", "grid"],
                      help="Types of featurizations to use.")
  parser.add_argument("--paths", nargs="+", required=1,
                      help = "Paths to input datasets.")
-  parser.add_argument('--model', required=1,
+  parser.add_argument("--model", required=1,
                      choices=["logistic", "rf_classifier", "rf_regressor",
                      "linear", "ridge", "lasso", "lasso_lars", "elastic_net",
                      "singletask_deep_network", "multitask_deep_network",
@@ -25,8 +41,12 @@ def parse_args(input_args=None):
  parser.add_argument("--splittype", type=str, default="scaffold",
                       choices=["scaffold", "random"],
                       help="Type of cross-validation data-splitting.")
  parser.add_argument("--prediction-endpoint", type=str, default="IC50",
  parser.add_argument("--prediction-endpoint", type=str, required=1,
                       help="Name of measured endpoint to predict.")
+  # TODO(rbharath): There should be a way to directly compute n-input from the
+  # provided feature-types?
+  parser.add_argument("--n-inputs", type=int, default=1024,
+                      help="Number of input features for models.")
  parser.add_argument("--n-hidden", type=int, default=500,
                      help="Number of hidden neurons for NN models.")
  parser.add_argument("--learning-rate", type=float, default=0.01,
@@ -55,37 +75,53 @@ def main():
  args = parse_args()
  paths = {}

-  for dataset, path in zip(args.datasets, args.paths):
-    paths[dataset] = path
+  paths = args.paths

-  task_types, task_transforms = get_default_task_types_and_transforms(paths)
+  print paths
+  targets = get_target_names(paths)
+  task_types = {target: args.task_type for target in targets}
+  input_transforms = args.input_transforms
+  print "input_transforms"
+  print input_transforms
+  output_transforms = {target: args.output_transforms for target in targets}

+  # TODO(rbharath): Too many settings are explicitly passed down here. Is there
+  # a good way to pass down configuration parameters into the invocations
+  # below.
  if args.model == "singletask_deep_network":
-    fit_singletask_mlp(paths.values(), task_types, task_transforms,
+    fit_singletask_mlp(paths, task_types, input_transforms, output_transforms,
      prediction_endpoint=args.prediction_endpoint,
+      feature_types=args.feature_types,
      splittype=args.splittype, 
+      n_inputs=args.n_inputs,
      n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      nb_epoch=args.n_epochs, decay=args.decay, batch_size=args.batch_size,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives, num_to_train=args.num_to_train)
  elif args.model == "multitask_deep_network":
-    fit_multitask_mlp(paths.values(), task_types, task_transforms,
+    fit_multitask_mlp(paths, task_types, input_transforms, output_transforms,
      prediction_endpoint=args.prediction_endpoint,
+      feature_types=args.feature_types,
      splittype=args.splittype,
+      n_inputs=args.n_inputs,
      n_hidden=args.n_hidden, learning_rate =
      args.learning_rate, dropout = args.dropout, batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives)
  elif args.model == "3D_cnn":
-    fit_3D_convolution(paths.values(), task_types, task_transforms,
+    fit_3D_convolution(paths, task_types, input_transforms, output_transforms,
+        feature_types=args.feature_types,
        prediction_endpoint=args.prediction_endpoint,
        axis_length=args.axis_length, nb_epoch=args.n_epochs,
        batch_size=args.batch_size)
  else:
-    fit_singletask_models(paths.values(), args.model, task_types,
-        task_transforms, splittype=args.splittype, num_to_train=args.num_to_train)
+    print args.prediction_endpoint
+    fit_singletask_models(paths, args.model, task_types,
+        input_transforms, output_transforms, prediction_endpoint=args.prediction_endpoint,
+        feature_types=args.feature_types, splittype=args.splittype,
+        num_to_train=args.num_to_train)

if __name__ == "__main__":
  main()
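Taken together, a sketch of a command line exercising the new flags, assuming parse_args forwards input_args through to argparse's parse_args; the module path and dataset path are hypothetical placeholders:

    from deep_chem.scripts.train import parse_args  # hypothetical module path

    args = parse_args(input_args=[
        "--paths", "/tmp/bace",
        "--model", "multitask_deep_network",
        "--task-type", "regression",
        "--input-transforms", "normalize",
        "--output-transforms", "log",
        "--feature-types", "fingerprints", "descriptors",
        "--prediction-endpoint", "pIC50",
        "--n-inputs", "1024"])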
+2 −2
-# Usage ./process_bace.sh INPUT_SDF_FILE
-python -m deep_chem.scripts.process_dataset --input-file $1 --input-type sdf --fields Name smiles pIC50 --field-types string string concentration --name BACE --out /tmp/
+# Usage ./process_bace.sh INPUT_SDF_FILE OUT_DIR
+python -m deep_chem.scripts.process_dataset --input-file $1 --input-type sdf --fields Name smiles pIC50 --field-types string string concentration --name BACE --out $2
+7 −10
@@ -43,6 +43,9 @@ def generate_directories(name, out):
  fingerprint_dir = os.path.join(dataset_dir, "fingerprints")
  if not os.path.exists(fingerprint_dir):
    os.makedirs(fingerprint_dir)
+  descriptor_dir = os.path.join(dataset_dir, "descriptors")
+  if not os.path.exists(descriptor_dir):
+    os.makedirs(descriptor_dir)
  target_dir = os.path.join(dataset_dir, "targets")
  if not os.path.exists(target_dir):
    os.makedirs(target_dir)
@@ -87,20 +90,13 @@ def generate_descriptors(name, out):
  fingerprint_dir = os.path.join(dataset_dir, "descriptors")
  shards_dir = os.path.join(dataset_dir, "shards")
  sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
-  fingerprints = os.path.join(fingerprint_dir,
-      "%s-fingerprints.pkl.gz" % name)
+  descriptors = os.path.join(fingerprint_dir,
+      "%s-descriptors.pkl.gz" % name)
  subprocess.call(["python", "-m", "vs_utils.scripts.featurize",
                   "--scaffolds", "--smiles",
-                   sdf, fingerprints,
+                   sdf, descriptors,
                   "descriptors"])

-def globavir_specs():
-  fields = ["compound_name", "isomeric_smiles", "tdo_ic50_nm", "tdo_Ki_nm",
-    "tdo_percent_activity_10_um", "tdo_percent_activity_1_um", "ido_ic50_nm",
-    "ido_Ki_nm", "ido_percent_activity_10_um", "ido_percent_activity_1_um"]
-  field_types = ["string", "string", "float", "float", "float", "float",
-      "float", "float", "float", "float"]

def get_rows(input_file, input_type):
  """Returns an iterator over all rows in input_file"""
  # TODO(rbharath): This function loads into memory, which can be painful. The
@@ -217,6 +213,7 @@ def main():
  generate_targets(args.input_file, args.input_type, args.fields,
      args.field_types, out_pkl, out_sdf)
  generate_fingerprints(args.name, args.out)
+  generate_descriptors(args.name, args.out)


if __name__ == "__main__":
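For reference, a sketch of the paths generate_descriptors now reads and writes, assuming dataset_dir is laid out by generate_directories as out/name; the name and out values are placeholders matching the BACE script above:

    import os

    name, out = "BACE", "/tmp"
    dataset_dir = os.path.join(out, name)  # assumed layout from generate_directories
    sdf = os.path.join(dataset_dir, "shards", "%s-0.sdf.gz" % name)
    descriptors = os.path.join(dataset_dir, "descriptors",
                               "%s-descriptors.pkl.gz" % name)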