Commit 38164b64 authored by Bharath Ramsundar
Browse files

Narrowing down bug to something wrong in modeler eval.

parent c422e15f
Loading
Loading
Loading
Loading
+27 −21
Original line number Diff line number Diff line
@@ -2,13 +2,14 @@
Top level script to featurize input, train models, and evaluate them.
"""
import argparse
import gzip
import numpy as np
import cPickle as pickle
from deep_chem.utils.featurize import generate_directories
from deep_chem.utils.featurize import extract_data
from deep_chem.utils.featurize import generate_targets
from deep_chem.utils.featurize import generate_features
from deep_chem.utils.featurize import generate_fingerprints
from deep_chem.utils.featurize import generate_descriptors
from deep_chem.utils.featurize import generate_vs_utils_features
from deep_chem.models.deep import fit_singletask_mlp
from deep_chem.models.deep import fit_multitask_mlp
from deep_chem.models.deep3d import fit_3D_convolution
@@ -84,24 +85,27 @@ def parse_args(input_args=None):
                            "specified requires that split be in original data.")
  transform_cmd.add_argument("--weight-positives", type=bool, default=False,
                  help="Weight positive examples to have same total weight as negatives.")
  transform_cmd.add_argument("--mode", default="singletask",
                      choices=["singletask", "multitask"],
                      help="Type of model being built.")
  transform_cmd.add_argument("--out", type=str, required=1,
                     help="Location to save transformed mode.")
  transform_cmd.set_defaults(func=transform_input)

  # TRAIN FLAGS
  train_cmd = subparsers.add_parser("train",
                  help="Train a model on specified data.")
                  help="Train a model on train data processed by transform.")
  group = train_cmd.add_argument_group("load-and-transform")
  group.add_argument("--task-type", default="classification",
                      choices=["classification", "regression"],
                      help="Type of learning task.")
  group.add_argument("--in", required=1,
  group.add_argument("--saved-data", required=1,
                     help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")

  group = train_cmd.add_argument_group("model")
  group.add_argument("--mode", default="singletask",
                      choices=["singletask", "multitask"],
                      help="Type of model being built.")
  group.add_argument("--model", required=1,
                      choices=["logistic", "rf_classifier", "rf_regressor",
                      "linear", "ridge", "lasso", "lasso_lars", "elastic_net",
@@ -132,19 +136,18 @@ def parse_args(input_args=None):
  train_cmd.set_defaults(func=train_model)

  eval_cmd = subparsers.add_parser("eval",
                help="Evaluate trained model on specified data.")
                help="Evaluate trained model on test data processed by transform.")
  group = eval_cmd.add_argument_group("load model/data")
  group.add_argument("--saved-model", type=str, required=1,
                  help="Location from which to load saved model.")
  group.add_argument("--in", required=1,
  group.add_argument("--saved-data", required=1,
                     help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")
  group.add_argument("--modeltype", required=1,
                      choices=["sklearn", "keras"],
                      help="Type of model to load.")
  # TODO(rbharath): Is there a way to get rid of this guy?
  group.add_argument("--mode", default="singletask",
                      choices=["singletask", "multitask"],
                      help="Type of model being built.")
  # TODO(rbharath): This argument seems a bit extraneous. Is it really
  # necessary?
  group.add_argument("--task-type", default="classification",
@@ -167,37 +170,41 @@ def featurize_input(args):
  """Featurizes raw input data."""
  if len(args.fields) != len(args.field_types):
    raise ValueError("number of fields does not equal number of field types")
  out_x_pkl, out_y_pkl, out_sdf = generate_directories(args.name, args.out, 
  out_x_pkl, out_y_pkl = generate_directories(args.name, args.out, 
      args.feature_endpoints)
  df, mols = extract_data(args.input_file, args.input_type, args.fields,
      args.field_types, args.prediction_endpoint, args.smiles_endpoint,
      args.threshold, args.delimiter)
  print "Generating targets"
  generate_targets(df, mols, args.prediction_endpoint, args.split_endpoint,
      args.smiles_endpoint, args.id_endpoint, out_y_pkl, out_sdf)
      args.smiles_endpoint, args.id_endpoint, out_y_pkl)
  print "Generating user-specified features"
  generate_features(df, args.feature_endpoints, args.smiles_endpoint,
                    args.id_endpoint, out_x_pkl)
  print "Generating circular fingerprints"
  generate_vs_utils_features(df, args.name, args.out, args.smiles_endpoint,
      args.id_endpoint, "fingerprints")
  print "Generating rdkit descriptors"
  generate_vs_utils_features(df, args.name, args.out, args.smiles_endpoint,
      args.id_endpoint, "descriptors")
  generate_descriptors(df, args.name, args.out, args.smiles_endpoint, args.id_endpoint)

def transform_input(args):
  """Transforms the featurized datasets and saves the result to disk.

  Reads the following attributes from args: paths, output_transforms,
  input_transforms, feature_types, splittype, weight_positives, mode, out.

  The same args.output_transforms value is applied to every target name
  returned by get_target_names(args.paths). The object returned by
  process_datasets() is written to args.out as a gzip-compressed pickle.
  """
  targets = get_target_names(args.paths)
  # Every target shares the single transform specification given on the
  # command line.
  output_transforms = {target: args.output_transforms for target in targets}
  per_task_data = process_datasets(args.paths,
      args.input_transforms, output_transforms,
      feature_types=args.feature_types, splittype=args.splittype,
      weight_positives=args.weight_positives, mode=args.mode)
  # gzip.open() defaults to read mode, so the file must be opened explicitly
  # in "wb" mode for pickle.dump() to be able to write to it.
  with gzip.open(args.out, "wb") as f:
    # Binary protocol, matching how generate_targets() writes its pickles.
    pickle.dump(per_task_data, f, pickle.HIGHEST_PROTOCOL)

def train_model(args):
  """Builds model from featurized data."""
  targets = get_target_names(args.paths)
  task_types = {target: args.task_type for target in targets}
  output_transforms = {target: args.output_transforms for target in targets}

  with gzip.open(args.in) as f:
  with gzip.open(args.saved_data) as f:
    per_task_data = pickle.load(f)

  if args.model == "singletask_deep_network":
@@ -227,10 +234,9 @@ def eval_trained_model(args):
  targets = get_target_names(args.paths)
  task_types = {target: args.task_type for target in targets}

  with gzip.open(args.in) as f:
  with gzip.open(args.saved_data) as f:
    per_task_data = pickle.load(f)

  output_transforms = {target: args.output_transforms for target in targets}
  results, aucs, r2s, rms = compute_model_performance(per_task_data, task_types, model, args.modeltype,
    args.compute_aucs, args.compute_r2s, args.compute_rms) 
  if args.csv_out is not None:
+0 −1
Original line number Diff line number Diff line
@@ -97,7 +97,6 @@ def model_predictions(test_set, model, n_targets, task_types,
    ypreds = [ypreds]
  return ypreds

  
def eval_model(test_set, model, task_types, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.

+9 −17
Original line number Diff line number Diff line
@@ -31,20 +31,18 @@ def generate_directories(name, out, feature_endpoints):
  target_dir = os.path.join(dataset_dir, "targets")
  if not os.path.exists(target_dir):
    os.makedirs(target_dir)
  shards_dir = os.path.join(dataset_dir, "shards")
  if not os.path.exists(shards_dir):
    os.makedirs(shards_dir)
  if feature_endpoints is not None:
    feature_endpoint_dir = os.path.join(dataset_dir, "features")
    if not os.path.exists(feature_endpoint_dir):
      os.makedirs(feature_endpoint_dir)

  # Return names of files to be generated
  # TODO(rbharath): Explicitly passing around out_*_pkl is an encapsulation
  # failure. Remove this.
  out_y_pkl = os.path.join(target_dir, "%s.pkl.gz" % name)
  out_sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
  out_x_pkl = (os.path.join(feature_endpoint_dir, "%s.pkl.gz" %name)
  out_x_pkl = (os.path.join(feature_endpoint_dir, "%s-features.pkl.gz" %name)
      if feature_endpoints is not None else None)
  return out_x_pkl, out_y_pkl, out_sdf
  return out_x_pkl, out_y_pkl

def parse_float_input(val):
  """Safely parses a float input."""
@@ -59,12 +57,12 @@ def parse_float_input(val):
    if ">" in val or "<" in val or "-" in val:
      return np.nan

def generate_vs_util_features(df, name, out, smiles_endpoint, id_endpoint, featuretype):
def generate_vs_utils_features(df, name, out, smiles_endpoint, id_endpoint, featuretype):
  """Generates circular fingerprints for dataset."""
  dataset_dir = os.path.join(out, name)
  feature_dir = os.path.join(dataset_dir, "fingerprints")
  feature_dir = os.path.join(dataset_dir, featuretype)
  features = os.path.join(feature_dir,
      "%s-%s.pkl.gz" % (name, featuretype)
      "%s-%s.pkl.gz" % (name, featuretype))

  feature_df = pd.DataFrame([]) 
  feature_df["smiles"] = df[[smiles_endpoint]]
@@ -80,7 +78,7 @@ def generate_vs_util_features(df, name, out, smiles_endpoint, id_endpoint, featu
    mols.append(Chem.MolFromSmiles(smiles))
  if featuretype == "fingerprints":
    featurizer = CircularFingerprint(size=1024)
  elif featurizer == "descriptors":
  elif featuretype == "descriptors":
    featurizer = SimpleDescriptors()
  else:
    raise ValueError("Unsupported featuretype requested.")
@@ -171,7 +169,7 @@ def process_field(data, field_type):
    return data 

def generate_targets(df, mols, prediction_endpoint, split_endpoint,
    smiles_endpoint, id_endpoint, out_pkl, out_sdf):
    smiles_endpoint, id_endpoint, out_pkl):
  """Process input data file, generate labels, i.e. y"""
  #TODO(enf, rbharath): Modify package unique identifier to take user-specified 
    #unique identifier instead of assuming smiles string
@@ -185,12 +183,6 @@ def generate_targets(df, mols, prediction_endpoint, split_endpoint,
  # Write pkl.gz file
  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(labels_df, f, pickle.HIGHEST_PROTOCOL)
  # Write sdf.gz file
  with gzip.open(out_sdf, "wb") as gz:
    w = Chem.SDWriter(gz)
    for mol in mols:
      w.write(mol)
    w.close()

def generate_scaffold(smiles_elt, include_chirality=False, smiles_endpoint="smiles"):
  smiles_string = smiles_elt[smiles_endpoint]
+3 −6
Original line number Diff line number Diff line
@@ -36,6 +36,7 @@ def process_datasets(paths, input_transforms, output_transforms,
  seed: int
    Seed used for random splits.
  """
  print "process_datasets()"
  dataset = load_and_transform_dataset(paths, input_transforms, output_transforms,
      feature_types=feature_types, weight_positives=weight_positives)
  arrays = {}
@@ -54,6 +55,8 @@ def process_datasets(paths, input_transforms, output_transforms,
    arrays["all"] = (train_data, test_data)
  else:
    raise ValueError("Unsupported mode for process_datasets.")
  print "np.shape(arrays['CANVAS-BACE'][0][1])"
  print np.shape(arrays['CANVAS-BACE'][0][1])
  return arrays


@@ -69,21 +72,15 @@ def load_molecules(paths, feature_types=["fingerprints"]):
    List of strings.
  """
  molecules = {}
  print "load_molecules()"
  for dataset_path in paths:
    for feature_type in feature_types:
      print "feature_type: %s" % feature_type
      pickle_dir = os.path.join(dataset_path, feature_type)
      print "pickle_dir: %s" % pickle_dir
      pickle_files = os.listdir(pickle_dir)
      if len(pickle_files) == 0:
        raise ValueError("No Pickle Files found to load molecules")
      for pickle_file in pickle_files:
        print "loading pickle_file: %s" % pickle_file
        with gzip.open(os.path.join(pickle_dir, pickle_file), "rb") as f:
          contents = pickle.load(f)
          print "contents.keys()"
          print contents.keys()
          smiles, features, scaffolds, mol_ids = (
              contents["smiles"], contents["features"],
              contents["scaffolds"], contents["mol_id"])