Commit c7939b38 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

model command now builds basic models.

parent dc6abe20
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ def fit_singletask_models(train_data, modeltype, task_types):
  """
  models = {}
  for index, target in enumerate(sorted(train_data.keys())):
    print "Building model %d" % index
    print "Building model for target %s" % target 
    (_, X_train, y_train, W_train) = train_data[target]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
+121 −70
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ import argparse
import gzip
import numpy as np
import cPickle as pickle
import os
from deep_chem.utils.featurize import generate_directories
from deep_chem.utils.featurize import extract_data
from deep_chem.utils.featurize import generate_targets
@@ -22,39 +23,43 @@ from deep_chem.utils.evaluate import compute_model_performance
def add_featurization_command(subparsers):
  """Register the "featurize" subcommand on the CLI subparser set.

  All featurization arguments live in add_featurize_group so they can be
  shared with the combined "model" command.
  """
  featurize_cmd = subparsers.add_parser("featurize",
      help="Featurize raw input data.")
  add_featurize_group(featurize_cmd)

def add_featurize_group(featurize_cmd):
  """Attach the shared featurization argument group to the given parser.

  Used by both the "featurize" subcommand and the combined "model"
  subcommand so the two stay in sync.
  """
  featurize_group = featurize_cmd.add_argument_group("Input Specifications")
  featurize_group.add_argument("--input-file", required=True,
      help="Input file with data.")
  featurize_group.add_argument("--input-type", default="csv",
      choices=["xlsx", "csv", "pandas", "sdf"],
      help="Type of input file. If pandas, input must be a pkl.gz\n"
           "containing a pandas dataframe. If sdf, should be in\n"
           "(perhaps gzipped) sdf file.")
  featurize_group.add_argument("--delimiter", default=",",
      help="If csv input, delimiter to use for read csv file")
  featurize_group.add_argument("--fields", required=True, nargs="+",
      help="Names of fields.")
  featurize_group.add_argument("--field-types", required=True, nargs="+",
      choices=["string", "float", "list-string", "list-float", "ndarray"],
      help="Type of data in fields.")
  featurize_group.add_argument("--feature-fields", type=str, nargs="+",
      help="Optional field that holds pre-computed feature vector")
  featurize_group.add_argument("--prediction-field", type=str, required=True,
      help="Name of measured field to predict.")
  featurize_group.add_argument("--split-field", type=str, default=None,
      help="Name of field specifying train/test split.")
  featurize_group.add_argument("--smiles-field", type=str, default="smiles",
      help="Name of field specifying SMILES for molecule.")
  featurize_group.add_argument("--id-field", type=str, default=None,
      help="Name of field specifying unique identifier for molecule.\n"
           "If none is specified, then smiles-field is used as identifier.")
  # TODO(rbharath): This should be moved to train-tests-split
  featurize_group.add_argument("--threshold", type=float, default=None,
      help="If specified, will be used to binarize real-valued prediction-field.")
  featurize_group.add_argument("--name", required=True,
      help="Name of the dataset.")
  featurize_group.add_argument("--out", required=True,
      help="Folder to generate processed dataset in.")
  featurize_group.set_defaults(func=featurize_input)

def add_train_test_command(subparsers):
  train_test_cmd = subparsers.add_parser("train-test-split",
@@ -86,7 +91,7 @@ def add_train_test_command(subparsers):
      help="Location to save test set.")
  train_test_cmd.set_defaults(func=train_test_input)

def add_model_arguments(fit_cmd):
def add_model_group(fit_cmd):
  group = fit_cmd.add_argument_group("model")
  group.add_argument("--model", required=1,
      choices=["logistic", "rf_classifier", "rf_regressor",
@@ -127,7 +132,7 @@ def add_fit_command(subparsers):
  group.add_argument("--paths", nargs="+", required=1,
                      help="Paths to input datasets.")

  add_model_arguments(fit_cmd)
  add_model_group(fit_cmd)
  group = fit_cmd.add_argument_group("save")
  group.add_argument("--saved-out", type=str, required=1,
      help="Location to save trained model.")
@@ -179,36 +184,12 @@ def add_model_command(subparsers):
  model_cmd = subparsers.add_parser("model",
      help="Combines featurize, train-test-split, fit, eval into one command\n"
           "for user convenience.")
  featurize_group = model_cmd.add_argument_group("featurize")
  featurize_group.add_argument("--input-file", required=1,
      help="Input file with data.")
  featurize_group.add_argument("--input-type", default="csv",
      choices=["xlsx", "csv", "pandas", "sdf"],
      help="Type of input file. If pandas, input must be a pkl.gz\n"
           "containing a pandas dataframe. If sdf, should be in\n"
           "(perhaps gzipped) sdf file.")
  featurize_group.add_argument("--fields", required=1, nargs="+",
      help = "Names of fields. Fields correspond to columns in csv files,\n"
             "and to molecular property names for SDF files.")
  featurize_group.add_argument("--field-types", required=1, nargs="+",
      choices=["string", "float", "list-string", "list-float", "ndarray"],
      help="Type of data in fields.")
  featurize_group.add_argument("--feature-fields", type=str, nargs="+",
      help="Optional endpoint that holds pre-computed feature vector")
  featurize_group.add_argument("--prediction-field", type=str, required=1,
      help="Name of measured endpoint to predict.")
  featurize_group.add_argument("--split-field", type=str, default=None,
      help="Name of endpoint specifying train/test split.")
  featurize_group.add_argument("--smiles-field", type=str, default="smiles",
      help="Name of endpoint specifying SMILES for molecule.")
  featurize_group.add_argument("--id-field", type=str, default=None,
      help="Name of endpoint specifying unique identifier for molecule.\n"
           "If none is specified, then smiles-endpoint is used as identifier.")
  featurize_group.add_argument("--feature-types", nargs="+", required=1,
  model_cmd.add_argument("--skip-featurization", action="store_true",
      help="If set, skip the featurization step.")
  model_cmd.add_argument("--feature-types", nargs="+", required=1,
      help="Types of featurizations to use. Each featurization must correspond\n"
           "to subdirectory in generated data directory.")
  featurize_group.add_argument("--out", required=1,
      help="Folder to generate processed dataset in.")
  add_featurize_group(model_cmd)

  train_test_group = model_cmd.add_argument_group("train_test_group")
  train_test_group.add_argument("--input-transforms", nargs="+", default=[],
@@ -225,12 +206,66 @@ def add_model_command(subparsers):
      help="Type of train/test data-splitting. 'scaffold' uses Bemis-Murcko scaffolds.\n"
           "specified requires that split be in original data.")

  add_model_arguments(model_cmd)
  add_model_group(model_cmd)
  model_cmd.add_argument("--task-type", default="classification",
      choices=["classification", "regression"],
      help="Type of learning task.")
  model_cmd.add_argument("--csv-out", type=str, default=None,
      help="Outputted predictions on the test set.")
  model_cmd.set_defaults(func=create_model)

def create_model(args):
  """Creates a model"""
  data_dir = os.path.join(args.out, args.name)
  print "+++++++++++++++++++++++++++++++++"
  print "Perform featurization"
  if not args.skip_featurization:
    _featurize_input(args.name, args.out, args.input_file, args.input_type,
        args.fields, args.field_types, args.feature_fields, args.prediction_field,
        args.smiles_field, args.split_field, args.id_field, args.threshold,
        args.delimiter)

  print "+++++++++++++++++++++++++++++++++"
  print "Perform train-test split"
  paths = [data_dir]
  weight_positives = False  # Hard coding this for now
  train_out = os.path.join(args.out, "%s-train.pkl.gz" % args.name)
  test_out = os.path.join(args.out, "%s-test.pkl.gz" % args.name)
  _train_test_input(paths, args.output_transforms,
      args.input_transforms, args.feature_types, args.splittype,
      weight_positives, args.mode, train_out, test_out)

  print "+++++++++++++++++++++++++++++++++"
  print "Fit model"
  modeltype = get_model_type(args.model)
  extension = get_model_extension(modeltype)
  saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
  _fit_model(paths, args.model, args.task_type, args.n_hidden,
      args.learning_rate, args.dropout, args.n_epochs, args.decay, args.batch_size,
      args.validation_split, saved_out, train_out)


  print "+++++++++++++++++++++++++++++++++"
  print "Eval Model on Train"
  print "-------------------"
  csv_out_train = os.path.join(data_dir, "%s-train.csv" % args.model)
  csv_out_test = os.path.join(data_dir, "%s-test.csv" % args.model)
  compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
    False, False, False, False)
  compute_r2s, compute_rms = False, False 
  if args.task_type == "classification":
    compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
        True, True, True, True)
  elif args.task_type == "regression":
    compute_r2s, compute_rms = True, True
  _eval_trained_model(modeltype, saved_out, train_out,
      paths, args.task_type, compute_aucs, compute_recall,
      compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_train)
  print "Eval Model on Test"
  print "------------------"
  _eval_trained_model(modeltype, saved_out, test_out,
      paths, args.task_type, compute_aucs, compute_recall,
      compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_test)

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -256,6 +291,8 @@ def _featurize_input(name, out, input_file, input_type, fields, field_types, fea
  """Featurizes raw input data."""
  if len(fields) != len(field_types):
    raise ValueError("number of fields does not equal number of field types")
  if id_field is None:
    id_field = smiles_field
  out_x_pkl, out_y_pkl = generate_directories(name, out, 
      feature_fields)
  df, mols = extract_data(input_file, input_type, fields,
@@ -303,11 +340,12 @@ def _train_test_input(paths, output_transforms, input_transforms,

def fit_model(args):
  """CLI entry point: fit a model from parsed command-line arguments.

  Thin wrapper that unpacks the argparse namespace into _fit_model.
  """
  # TODO(rbharath): Bundle these arguments up into a training_params dict.
  _fit_model(args.paths, args.model, args.task_type, args.n_hidden,
      args.learning_rate, args.dropout, args.n_epochs, args.decay, args.batch_size,
      args.validation_split, args.saved_out, args.saved_data)

def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
      n_epochs, decay, batch_size, validation_split, saved)out):
      n_epochs, decay, batch_size, validation_split, saved_out, saved_data):
  """Builds model from featurized data."""
  targets = get_target_names(paths)
  task_types = {target: task_type for target in targets}
@@ -316,37 +354,50 @@ def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
    stored_train = pickle.load(f)
  train_dict = stored_train["transformed"]

  if args.model == "singletask_deep_network":
  if model == "singletask_deep_network":
    from deep_chem.models.deep import fit_singletask_mlp
    models = fit_singletask_mlp(train_dict, task_types, n_hidden=n_hidden,
      learning_rate=learning_rate, dropout=dropout,
      nb_epoch=n_epochs, decay=decay, batch_size=batch_size,
      validation_split=validation_split)
  elif args.model == "multitask_deep_network":
  elif model == "multitask_deep_network":
    from deep_chem.models.deep import fit_multitask_mlp
    models = fit_multitask_mlp(train_dict, task_types,
      n_hidden=n_hidden, learning_rate = learning_rate,
      dropout=dropout, batch_size=batch_size,
      nb_epoch=n_epochs, decay=decay,
      validation_split=validation_split)
  elif args.model == "3D_cnn":
  elif model == "3D_cnn":
    from deep_chem.models.deep3d import fit_3D_convolution
    models = fit_3D_convolution(train_data, test_data, task_types,
        nb_epoch=n_epochs, batch_size=batch_size)
  else:
    models = fit_singletask_models(train_dict, args.model, task_types)
  if args.model in ["singletask_deep_network", "multitask_deep_network", "3D_cnn"]:
    models = fit_singletask_models(train_dict, model, task_types)
  modeltype = get_model_type(model)
  save_model(models, modeltype, saved_out)

def get_model_type(model):
  """Return the framework family for a model name.

  Deep-network models are built with Keras; everything else falls back to
  scikit-learn. The stray `save_model` call that leaked into this function
  referenced names (`models`, `saved_out`) undefined here and is removed.
  """
  if model in ["singletask_deep_network", "multitask_deep_network", "3D_cnn"]:
    modeltype = "keras"
  else:
    modeltype = "sklearn"
  return modeltype

def get_model_extension(modeltype):
  """Map a model framework family to its on-disk file extension.

  Returns None for an unrecognized modeltype, matching the original
  fall-through behavior.
  """
  extensions = {
      "sklearn": "joblib",
      "autograd": "pkl.gz",
      "keras": "h5",
  }
  return extensions.get(modeltype)

def eval_trained_model(args):
  """CLI entry point: evaluate a saved model from parsed arguments.

  Thin wrapper over _eval_trained_model; the old call using bare
  (undefined) local names is removed — it raised NameError at runtime.
  """
  _eval_trained_model(args.modeltype, args.saved_model, args.saved_data,
      args.paths, args.task_type, args.compute_aucs, args.compute_recall,
      args.compute_accuracy, args.compute_matthews_corrcoef, args.compute_r2s,
      args.compute_rms, args.csv_out)

def eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
    compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef,
    compute_r2s, compute_rms, csv_out):
  model = load_model(modeltype, saved_model)
+38 −39
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ from vs_utils.utils import SmilesGenerator, ScaffoldGenerator
from vs_utils.features.fingerprints import CircularFingerprint
from vs_utils.features.basic import SimpleDescriptors

def generate_directories(name, out, feature_endpoints):
def generate_directories(name, out, feature_fields):
  """Generate directory structure for featurized dataset."""
  dataset_dir = os.path.join(out, name)
  if not os.path.exists(dataset_dir):
@@ -31,17 +31,17 @@ def generate_directories(name, out, feature_endpoints):
  target_dir = os.path.join(dataset_dir, "targets")
  if not os.path.exists(target_dir):
    os.makedirs(target_dir)
  if feature_endpoints is not None:
    feature_endpoint_dir = os.path.join(dataset_dir, "features")
    if not os.path.exists(feature_endpoint_dir):
      os.makedirs(feature_endpoint_dir)
  if feature_fields is not None:
    feature_field_dir = os.path.join(dataset_dir, "features")
    if not os.path.exists(feature_field_dir):
      os.makedirs(feature_field_dir)

  # Return names of files to be generated
  # TODO(rbharath): Explicitly passing around out_*_pkl is an encapsulation
  # failure. Remove this.
  out_y_pkl = os.path.join(target_dir, "%s.pkl.gz" % name)
  out_x_pkl = (os.path.join(feature_endpoint_dir, "%s-features.pkl.gz" %name)
      if feature_endpoints is not None else None)
  out_x_pkl = (os.path.join(feature_field_dir, "%s-features.pkl.gz" %name)
      if feature_fields is not None else None)
  return out_x_pkl, out_y_pkl

def parse_float_input(val):
@@ -57,7 +57,7 @@ def parse_float_input(val):
    if ">" in val or "<" in val or "-" in val:
      return np.nan

def generate_vs_utils_features(df, name, out, smiles_endpoint, id_endpoint, featuretype):
def generate_vs_utils_features(df, name, out, smiles_field, id_field, featuretype):
  """Generates circular fingerprints for dataset."""
  dataset_dir = os.path.join(out, name)
  feature_dir = os.path.join(dataset_dir, featuretype)
@@ -65,16 +65,16 @@ def generate_vs_utils_features(df, name, out, smiles_endpoint, id_endpoint, feat
      "%s-%s.pkl.gz" % (name, featuretype))

  feature_df = pd.DataFrame([]) 
  feature_df["smiles"] = df[[smiles_endpoint]]
  feature_df["scaffolds"] = df[[smiles_endpoint]].apply(
    functools.partial(generate_scaffold, smiles_endpoint=smiles_endpoint),
  feature_df["smiles"] = df[[smiles_field]]
  feature_df["scaffolds"] = df[[smiles_field]].apply(
    functools.partial(generate_scaffold, smiles_field=smiles_field),
    axis=1)
  feature_df["mol_id"] = df[[id_endpoint]]
  feature_df["mol_id"] = df[[id_field]]

  mols = []
  for row in df.iterrows():
    # pandas rows are tuples (row_num, row_data)
    smiles = row[1][smiles_endpoint]
    smiles = row[1][smiles_field]
    mols.append(Chem.MolFromSmiles(smiles))
  if featuretype == "fingerprints":
    featurizer = CircularFingerprint(size=1024)
@@ -125,7 +125,7 @@ def get_colnames(row, input_type):
  elif input_type == "csv":
    return row

def get_row_data(row, input_type, fields, smiles_endpoint, colnames=None):
def get_row_data(row, input_type, fields, smiles_field, colnames=None):
  """Extract information from row data."""
  row_data = {}
  if input_type == "xlsx":
@@ -142,11 +142,10 @@ def get_row_data(row, input_type, fields, smiles_endpoint, colnames=None):
    for field in fields:
      row_data[field] = row[field]
  elif input_type == "sdf":
    mol = {}
    mol = row 
    for field in fields:
      if field == smiles_endpoint:
        row_data[field] = Chem.MolToSmiles(mol)
      elif not mol.HasProp(field):
      row_data[smiles_field] = Chem.MolToSmiles(mol)
      if not mol.HasProp(field):
        row_data[field] = None
      else:
        row_data[field] = mol.GetProp(field)
@@ -168,46 +167,46 @@ def process_field(data, field_type):
  elif field_type == "ndarray":
    return data 

def generate_targets(df, mols, prediction_field, split_field,
    smiles_field, id_field, out_pkl):
  """Process input data file, generate labels, i.e. y.

  Writes a gzipped pickle of a DataFrame with columns mol_id, smiles,
  prediction, and (when split_field is given) split to out_pkl.
  `mols` is accepted for interface compatibility but unused here.
  The interleaved duplicate lines referencing the old *_endpoint
  parameter names (undefined in this signature) are removed.
  """
  # TODO(enf, rbharath): Modify package unique identifier to take
  # user-specified unique identifier instead of assuming smiles string.
  labels_df = pd.DataFrame([])
  labels_df["mol_id"] = df[[id_field]]
  labels_df["smiles"] = df[[smiles_field]]
  labels_df["prediction"] = df[[prediction_field]]
  if split_field is not None:
    labels_df["split"] = df[[split_field]]

  # Write pkl.gz file
  with gzip.open(out_pkl, "wb") as f:
    pickle.dump(labels_df, f, pickle.HIGHEST_PROTOCOL)

def generate_scaffold(smiles_elt, include_chirality=False, smiles_field="smiles"):
  """Return the scaffold for the molecule whose SMILES is smiles_elt[smiles_field].

  The duplicated stale definition (using the old smiles_endpoint parameter
  name) was dead code shadowed by this one and is removed.
  """
  smiles_string = smiles_elt[smiles_field]
  mol = Chem.MolFromSmiles(smiles_string)
  engine = ScaffoldGenerator(include_chirality=include_chirality)
  scaffold = engine.get_scaffold(mol)
  return scaffold

def generate_features(df, feature_endpoints, smiles_endpoint, id_endpoint, out_pkl):
  if feature_endpoints is None:
    print("No feature endpoint specified by user.")
def generate_features(df, feature_fields, smiles_field, id_field, out_pkl):
  if feature_fields is None:
    print("No feature field specified by user.")
    return

  features_df = pd.DataFrame([]) 
  features_df["smiles"] = df[[smiles_endpoint]]
  features_df["scaffolds"] = df[[smiles_endpoint]].apply(
    functools.partial(generate_scaffold, smiles_endpoint=smiles_endpoint),
  features_df["smiles"] = df[[smiles_field]]
  features_df["scaffolds"] = df[[smiles_field]].apply(
    functools.partial(generate_scaffold, smiles_field=smiles_field),
    axis=1)
  features_df["mol_id"] = df[[id_endpoint]]
  features_df["mol_id"] = df[[id_field]]

  features_data = []
  for row in df.iterrows():
    # pandas rows are tuples (row_num, row_data)
    row, feature_list = row[1], []
    for feature in feature_endpoints:
    for feature in feature_fields:
      feature_list.append(row[feature])
    features_data.append({"row": np.array(feature_list)})
  features_df["features"] = pd.DataFrame(features_data)
@@ -216,7 +215,7 @@ def generate_features(df, feature_endpoints, smiles_endpoint, id_endpoint, out_p
    pickle.dump(features_df, f, pickle.HIGHEST_PROTOCOL)

def extract_data(input_file, input_type, fields, field_types, 
      prediction_endpoint, smiles_endpoint, threshold, delimiter):
      prediction_field, smiles_field, threshold, delimiter):
  """Extracts data from input as Pandas data frame"""
  rows, mols, smiles = [], [], SmilesGenerator()
  colnames = [] 
@@ -231,14 +230,14 @@ def extract_data(input_file, input_type, fields, field_types,
    if (input_type == "xlsx" or input_type == "csv") and row_index == 0:  
      colnames = get_colnames(raw_row, input_type)
      continue
    row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_endpoint, colnames)
    row, row_data = {}, get_row_data(raw_row, input_type, fields, smiles_field, colnames)
    for ind, (field, field_type) in enumerate(zip(fields, field_types)):
      if field == prediction_endpoint and threshold is not None:
      if field == prediction_field and threshold is not None:
        raw_val = process_field(row_data[field], field_type)
        row[field] = 1 if raw_val > threshold else 0 
      else:
        row[field] = process_field(row_data[field], field_type)
    mol = Chem.MolFromSmiles(row[smiles_endpoint])
    mol = Chem.MolFromSmiles(row_data[smiles_field])
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    rows.append(row)