Commit f4a665f8 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

pylint fixes and python3 upgrades

parent adc366b5
Loading
Loading
Loading
Loading
+17 −19
Original line number Diff line number Diff line
@@ -2,10 +2,8 @@
Code for processing the Google vs-datasets using keras.
"""
import numpy as np
import keras
from keras.models import Graph
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD
from deep_chem.utils.preprocess import to_one_hot

@@ -58,11 +56,12 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
    task_y_train = y_train[flat_W_train.nonzero()]
    print "%d compounds in Train" % len(train_ids)
    models[task] = train_multitask_model(task_X_train, task_y_train, W_train,
        {task: task_types[task]}, **training_params)
                                         {task: task_types[task]},
                                         **training_params)
  return models

def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                          validation_split=0.1):
  """
@@ -92,7 +91,6 @@ def train_multitask_model(X, y, W, task_types,
    maximal number of epochs to run the optimizer
  """
  eps = .001
  num_tasks = len(task_types)
  sorted_tasks = sorted(task_types.keys())
  local_task_types = task_types.copy()
  endpoints = sorted_tasks
+12 −15
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D

def fit_3D_convolution(train_data, task_types, **training_params):
def fit_3D_convolution(train_data, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
@@ -17,7 +17,6 @@ def fit_3D_convolution(train_data, task_types, **training_params):
    raise ValueError("3D Convolutions only supported for singletask.")
  task_name = train_data["sorted_tasks"][0]
  (y_train, _) = train_data["sorted_tasks"].itervalues().next()
  nb_classes = 2
  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
  return models

@@ -39,8 +38,6 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,
  (n_samples, axis_length, _, _, n_channels) = np.shape(X)
  X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  print "Final shape of X: " + str(np.shape(X))
  # Number of classes for classification
  nb_classes = 2

  # number of convolutional filters to use at each layer
  nb_filters = [axis_length/2, axis_length, axis_length]
@@ -53,8 +50,8 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,

  model = Sequential()
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,
     nb_row=nb_conv[0], nb_col=nb_conv[0], nb_depth=nb_conv[0],
     border_mode='valid'))
                          nb_row=nb_conv[0], nb_col=nb_conv[0],
                          nb_depth=nb_conv[0], border_mode='valid'))
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))
  model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
@@ -63,18 +60,18 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[1], nb_pool[1], nb_pool[1])))
  model.add(Convolution3D(nb_filter=nb_filters[2], stack_size=nb_filters[1],
     nb_row=nb_conv[2], nb_col=nb_conv[2], nb_depth=nb_conv[2],
     border_mode='valid'))
                          nb_row=nb_conv[2], nb_col=nb_conv[2],
                          nb_depth=nb_conv[2], border_mode='valid'))
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[2], nb_pool[2], nb_pool[2])))
  model.add(Flatten())
  # TODO(rbharath): If we change away from axis-size 32, this code will break.
  # Eventually figure out a more general rule that works for all axis sizes.
  model.add(Dense(32, 32/2, init='normal'))
  model.add(Dense(32/2, init='normal'))
  model.add(Activation('relu'))
  model.add(Dropout(0.5))
  # TODO(rbharath): Generalize this to support classification as well as regression.
  model.add(Dense(32/2, 1, init='normal'))
  model.add(Dense(1, init='normal'))

  sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
  print "About to compile model"
+0 −11
Original line number Diff line number Diff line
@@ -69,14 +69,3 @@ def fit_singletask_models(train_data, modeltype):
    model.fit(task_X_train, task_y_train.ravel())
    models[task] = model
  return models

## TODO(rbharath): I believe this is broken. Update it to work with the rest of
## the package.
#def fit_multitask_rf(train_data):
#  """Fits a multitask RF model to provided dataset.
#  """
#  (_, X_train, y_train, _) = train_data
#  model = RandomForestClassifier(
#      n_estimators=100, n_jobs=-1, class_weight="auto")
#  model.fit(X_train, y_train)
#  return model
+35 −51
Original line number Diff line number Diff line
"""
Top level script to featurize input, train models, and evaluate them.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import argparse
import gzip
import cPickle as pickle
import joblib
import os
from deep_chem.utils.featurize import generate_directories
from deep_chem.utils.featurize import extract_data
@@ -12,7 +12,6 @@ from deep_chem.utils.featurize import generate_targets
from deep_chem.utils.featurize import generate_features
from deep_chem.utils.featurize import generate_vs_utils_features
from deep_chem.models.standard import fit_singletask_models
#from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import process_datasets
from deep_chem.utils.load import transform_data
from deep_chem.utils.evaluate import results_to_csv
@@ -171,11 +170,6 @@ def add_fit_command(subparsers):
  group.add_argument(
      "--saved-data", required=1,
      help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument(
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")

  add_model_group(fit_cmd)
  group = fit_cmd.add_argument_group("save")
  group.add_argument(
@@ -195,10 +189,6 @@ def add_eval_command(subparsers):
      help="Location from which to load saved model.")
  group.add_argument(
      "--saved-data", required=1, help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument(
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")
  group.add_argument(
      "--modeltype", required=1,
      choices=["sklearn", "keras-graph", "keras-sequential"],
@@ -291,8 +281,8 @@ def add_model_command(subparsers):
def create_model(args):
  """Creates a model"""
  data_dir = os.path.join(args.out, args.name)
  print "+++++++++++++++++++++++++++++++++"
  print "Perform featurization"
  print("+++++++++++++++++++++++++++++++++")
  print("Perform featurization")
  if not args.skip_featurization:
    _featurize_input(
        args.name, args.out, args.input_file, args.input_type, args.fields,
@@ -300,8 +290,8 @@ def create_model(args):
        args.smiles_field, args.split_field, args.id_field, args.threshold,
        args.delimiter)

  print "+++++++++++++++++++++++++++++++++"
  print "Perform train-test split"
  print("+++++++++++++++++++++++++++++++++")
  print("Perform train-test split")
  paths = [data_dir]
  weight_positives = False  # Hard coding this for now
  train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
@@ -312,21 +302,21 @@ def create_model(args):
        args.splittype, weight_positives, args.mode, train_out, test_out,
        args.target_fields)

  print "+++++++++++++++++++++++++++++++++"
  print "Fit model"
  print("+++++++++++++++++++++++++++++++++")
  print("Fit model")
  modeltype = get_model_type(args.model)
  extension = get_model_extension(modeltype)
  saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
  if not args.skip_fit:
    _fit_model(
        paths, args.model, args.task_type, args.n_hidden, args.learning_rate,
        args.model, args.task_type, args.n_hidden, args.learning_rate,
        args.dropout, args.n_epochs, args.decay, args.batch_size, args.loss_function,
        args.validation_split, saved_out, train_out, args.target_fields)


  print "+++++++++++++++++++++++++++++++++"
  print "Eval Model on Train"
  print "-------------------"
  print("+++++++++++++++++++++++++++++++++")
  print("Eval Model on Train")
  print("-------------------")
  csv_out_train = os.path.join(data_dir, "%s-train.csv" % args.name)
  stats_out_train = os.path.join(data_dir, "%s-train-stats.txt" % args.name)
  csv_out_test = os.path.join(data_dir, "%s-test.csv" % args.name)
@@ -334,24 +324,22 @@ def create_model(args):
  compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
      False, False, False, False)
  compute_r2s, compute_rms = False, False
  print "create_model()"
  print "args.task_type"
  print args.task_type
  print("create_model()")
  print("args.task_type")
  print(args.task_type)
  if args.task_type == "classification":
    compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
        True, True, True, True)
  elif args.task_type == "regression":
    compute_r2s, compute_rms = True, True
  _eval_trained_model(
      modeltype, saved_out, train_out, paths, args.task_type, compute_aucs,
      modeltype, saved_out, train_out, args.task_type, compute_aucs,
      compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_train, stats_out_train, args.target_fields)
  print "(compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)"
  print (compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)
  print "Eval Model on Test"
  print "------------------"
  print("Eval Model on Test")
  print("------------------")
  _eval_trained_model(
      modeltype, saved_out, test_out, paths, args.task_type, compute_aucs,
      modeltype, saved_out, test_out, args.task_type, compute_aucs,
      compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_test, stats_out_test, args.target_fields)

@@ -386,17 +374,17 @@ def _featurize_input(name, out, input_file, input_type, fields, field_types,
  if id_field is None:
    id_field = smiles_field
  out_x_pkl, out_y_pkl = generate_directories(name, out, feature_fields)
  df, mols = extract_data(
  df, _ = extract_data(
      input_file, input_type, fields, field_types, target_fields,
      smiles_field, threshold, delimiter)
  print "Generating targets"
  print("Generating targets")
  generate_targets(df, target_fields, split_field,
                   smiles_field, id_field, out_y_pkl)
  print "Generating user-specified features"
  print("Generating user-specified features")
  generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
  print "Generating circular fingerprints"
  print("Generating circular fingerprints")
  generate_vs_utils_features(df, name, out, smiles_field, id_field, "fingerprints")
  print "Generating rdkit descriptors"
  print("Generating rdkit descriptors")
  generate_vs_utils_features(df, name, out, smiles_field, id_field, "descriptors")

def train_test_input(args):
@@ -418,10 +406,8 @@ def _train_test_input(paths, output_transforms, input_transforms,
  feature_types = feature_types.split(",")
  print("About to process_dataset")
  train_dict, test_dict = process_datasets(
      paths, input_transforms, output_transforms_dict,
      feature_types=feature_types, splittype=splittype,
      weight_positives=weight_positives, mode=mode,
      target_names=target_names)
      paths, feature_types=feature_types, splittype=splittype,
      mode=mode, target_names=target_names)
  print("Finished process_dataset")

  print("Starting transform_data")
@@ -446,16 +432,15 @@ def fit_model(args):
  """Wrapper that calls _fit_model with arguments unwrapped."""
  # TODO(rbharath): Bundle these arguments up into a training_params dict.
  _fit_model(
      args.paths, args.model, args.task_type, args.n_hidden,
      args.model, args.task_type, args.n_hidden,
      args.learning_rate, args.dropout, args.n_epochs, args.decay,
      args.batch_size, args.loss_function, args.validation_split,
      args.saved_out, args.saved_data, args.target_fields)

def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
def _fit_model(model, task_type, n_hidden, learning_rate, dropout,
               n_epochs, decay, batch_size, loss_function, validation_split, saved_out,
               saved_data, target_names):
  """Builds model from featurized data."""
  #targets = get_target_names(paths)
  task_types = {target: task_type for target in target_names}

  stored_train = load_sharded_dataset(saved_data)
@@ -476,7 +461,7 @@ def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
  elif model == "3D_cnn":
    from deep_chem.models.deep3d import fit_3D_convolution
    models = fit_3D_convolution(
        train_dict, task_types, nb_epoch=n_epochs, batch_size=batch_size,
        train_dict, nb_epoch=n_epochs, batch_size=batch_size,
        learning_rate=learning_rate, loss_function=loss_function)
  else:
    models = fit_singletask_models(train_dict, model)
@@ -507,19 +492,18 @@ def get_model_extension(modeltype):
def eval_trained_model(args):
  """Wrapper function that calls _eval_trained_model with unwrapped args."""
  _eval_trained_model(
      args.modeltype, args.saved_model, args.saved_data, args.paths,
      args.modeltype, args.saved_model, args.saved_data,
      args.task_type, args.compute_aucs, args.compute_recall,
      args.compute_accuracy, args.compute_matthews_corrcoef, args.compute_r2s,
      args.compute_rms, args.csv_out, args.stats_out,
      args.target_fields)

def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
def _eval_trained_model(modeltype, saved_model, saved_data, task_type,
                        compute_aucs, compute_recall, compute_accuracy,
                        compute_matthews_corrcoef, compute_r2s, compute_rms,
                        csv_out, stats_out, target_names):
  """Evaluates a trained model on specified data."""
  model = load_model(modeltype, saved_model)
  #targets = get_target_names(paths)
  task_types = {target: task_type for target in target_names}

  stored_test = load_sharded_dataset(saved_data)
@@ -534,7 +518,7 @@ def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
        recall=compute_recall, accuracy=compute_accuracy,
        mcc=compute_matthews_corrcoef, print_file=stats_file)
  with open(stats_out, "r") as stats_file:
    print stats_file.read()
    print(stats_file.read())
  results_to_csv(results, csv_out, task_type=task_type)

def main():

deep_chem/utils/analysis.py

deleted 100644 → 0
+0 −128
Original line number Diff line number Diff line
"""
Utility functions to compare datasets to one another.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import numpy as np

def results_to_csv(results_dict):
  """Pretty prints results as a two-line CSV (header row, then values).

  Parameters
  ----------
  results_dict: dict
    Maps target names to result values. Values are converted with str().
  """
  targets = sorted(results_dict.keys())
  # Header row of target names, then one row of values in the same order.
  print(",".join(targets))
  print(",".join([str(results_dict[target]) for target in targets]))

def summarize_distribution(y):
  """Analyzes regression dataset.

  Prints summary statistics (mean, std, min, max, histogram) of the
  provided distribution. Returns nothing; output goes to stdout.

  Parameters
  ----------
  y: np.ndarray
    A 1D numpy array containing distribution.
  """
  mean = np.mean(y)
  std = np.std(y)
  minval = np.amin(y)
  maxval = np.amax(y)
  # np.histogram returns (counts, bin_edges) with 10 bins by default.
  hist = np.histogram(y)
  print("Mean: %f" % mean)
  print("Std: %f" % std)
  print("Min: %f" % minval)
  print("Max: %f" % maxval)
  print("Histogram: ")
  print(hist)

#def analyze_data(dataset, splittype="random"):
#  """Analyzes regression dataset.
#
#  Parameters
#  ----------
#  dataset: dict
#    A dictionary of type produced by load_datasets.
#  splittype: string
#    Type of split for train/test. Either random or scaffold.
#  """
#  singletask = multitask_to_singletask(dataset)
#  for target in singletask:
#    data = singletask[target]
#    if len(data.keys()) == 0:
#      continue
#    if splittype == "random":
#      train, test = train_test_random_split(data, seed=0)
#    elif splittype == "scaffold":
#      train, test = train_test_scaffold_split(data)
#    else:
#      raise ValueError("Improper splittype. Must be random/scaffold.")
#    _, Xtrain, ytrain, _ = dataset_to_numpy(train)
#    # TODO(rbharath): Take this out once debugging is completed
#    ytrain = np.log(ytrain)
#    mean = np.mean(ytrain)
#    std = np.std(ytrain)
#    minval = np.amin(ytrain)
#    maxval = np.amax(ytrain)
#    hist = np.histogram(ytrain)
#    print target
#    print "Mean: %f" % mean
#    print "Std: %f" % std
#    print "Min: %f" % minval
#    print "Max: %f" % maxval
#    print "Histogram: "
#    print hist


def compare_all_datasets():
  """Compare all datasets in our collection.

  Pairwise compares the MUV, PCBA, DUDE, and Pfizer datasets by molecule
  and scaffold overlap, printing results to stdout. Dataset locations are
  hard-coded to one machine's filesystem.

  TODO(rbharath): Make this actually robust.
  """
  # NOTE(review): load_datasets is not imported in this module (only numpy
  # is) — confirm its intended source and add the import.
  muv_path = "/home/rbharath/vs-datasets/muv"
  pcba_path = "/home/rbharath/vs-datasets/pcba"
  dude_path = "/home/rbharath/vs-datasets/dude"
  pfizer_path = "/home/rbharath/private-datasets/pfizer"
  muv_data = load_datasets([muv_path])
  pcba_data = load_datasets([pcba_path])
  dude_data = load_datasets([dude_path])
  pfizer_data = load_datasets([pfizer_path])
  print("----------------------")
  compare_datasets("muv", muv_data, "pcba", pcba_data)
  print("----------------------")
  compare_datasets("pfizer", pfizer_data, "muv", muv_data)
  print("----------------------")
  compare_datasets("pfizer", pfizer_data, "pcba", pcba_data)
  print("----------------------")
  compare_datasets("muv", muv_data, "dude", dude_data)
  print("----------------------")
  compare_datasets("pcba", pcba_data, "dude", dude_data)
  print("----------------------")
  compare_datasets("pfizer", pfizer_data, "dude", dude_data)

def compare_datasets(first_name, first, second_name, second):
  """Counts the overlap between two provided datasets.

  Prints molecule and scaffold counts for each dataset, and the number of
  scaffolds common to both. Returns nothing; output goes to stdout.

  Parameters
  ----------
  first_name: string
    Name of first dataset
  first: dict
    Data dictionary generated by load_datasets. Each value is unpacked as a
    3-tuple whose second entry is the scaffold.
  second_name: string
    Name of second dataset
  second: dict
    Data dictionary generated by load_datasets.
  """
  first_scaffolds = set()
  for key in first:
    _, scaffold, _ = first[key]
    first_scaffolds.add(scaffold)
  print("%d molecules in %s dataset" % (len(first), first_name))
  print("%d scaffolds in %s dataset" % (len(first_scaffolds), first_name))
  second_scaffolds = set()
  for key in second:
    _, scaffold, _ = second[key]
    second_scaffolds.add(scaffold)
  print("%d molecules in %s dataset" % (len(second), second_name))
  print("%d scaffolds in %s dataset" % (len(second_scaffolds), second_name))
  common_scaffolds = first_scaffolds.intersection(second_scaffolds)
  print("%d scaffolds in both" % len(common_scaffolds))
Loading