Commit 5d54bc74 authored by Bharath Ramsundar
Browse files

Removed extraneous imports and updated CSV files to display molecular IDs

parent 15f8d3aa
Loading
Loading
Loading
Loading
+3 −13
Original line number Diff line number Diff line
@@ -2,22 +2,12 @@
Code for processing the Google vs-datasets using keras.
"""
import numpy as np
import sys
import keras
from keras.models import Graph
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from deep_chem.utils.load import load_datasets
from deep_chem.utils.load import ensure_balanced
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores


def fit_multitask_mlp(per_task_data, task_types, **training_params):
@@ -36,7 +26,7 @@ def fit_multitask_mlp(per_task_data, task_types, **training_params):
  models = {}
  # Follows convention from process_datasets that the data for multitask models
  # is grouped under key "all"
  (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
  (_, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
      per_task_data["all"])
  models["all"] = train_multitask_model(X_train, y_train, W_train, task_types,
                                **training_params)
@@ -59,9 +49,9 @@ def fit_singletask_mlp(per_task_data, task_types, **training_params):
  for index, target in enumerate(sorted(per_task_data.keys())):
    print "Training model %d" % index
    print "Target %s" % target
    (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
    (train_ids, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        per_task_data[target])
    print "%d compounds in Train" % len(train)
    print "%d compounds in Train" % len(train_ids)
    print "%d compounds in Test" % len(test)
    models[target] = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, **training_params)
+1 −5
Original line number Diff line number Diff line
@@ -6,17 +6,13 @@ from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.utils import np_utils
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores

def fit_3D_convolution(per_task_data, task_types, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.

  Looks up the entry stored under key "all" in per_task_data, unpacks
  X_train and y_train from its first element, and hands them to
  train_3D_convolution together with training_params. Returns a dict whose
  single key "all" maps to whatever train_3D_convolution returns.
  task_types is accepted but not read in this body.
  """
  models = {}
  # NOTE(review): the two unpacking lines below appear to be the
  # removed/added pair of this diff hunk (shown without +/- markers). They
  # differ only in whether the first tuple element is bound to `train` or
  # discarded as `_`; neither name is read afterwards.
  (train, X_train, y_train, _), _ = per_task_data["all"]
  (_, X_train, y_train, _), _ = per_task_data["all"]
  # nb_classes is assigned here but not used anywhere else in this function.
  nb_classes = 2
  models["all"] = train_3D_convolution(X_train, y_train, **training_params)
  return models
+3 −12
Original line number Diff line number Diff line
@@ -2,13 +2,6 @@
Code for processing datasets using scikit-learn.
"""
import numpy as np
from deep_chem.utils.analysis import results_to_csv
from deep_chem.utils.preprocess import split_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskLasso 
@@ -43,7 +36,7 @@ def fit_singletask_models(per_task_data, modeltype, task_types):
  models = {}
  for index, target in enumerate(sorted(per_task_data.keys())):
    print "Building model %d" % index
    (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
    (_, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
        per_task_data[target])
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
@@ -74,11 +67,9 @@ def fit_singletask_models(per_task_data, modeltype, task_types):
def fit_multitask_rf(train_data, test_data, task_types):
  """Fits a multitask RF model to provided dataset.
  """
  # NOTE(review): the next two lines appear to be the removed/added pair of
  # this diff hunk (shown without +/- markers), so the block is not valid
  # Python exactly as rendered here.
  # NOTE(review): in both versions the second tuple reuses the names X_train,
  # y_train and W_train. After unpacking, those names hold the elements of
  # test_data, so model.fit below receives them rather than the elements of
  # train_data. Presumably unintended -- confirm, and if so bind the second
  # tuple to X_test / y_test / W_test instead.
  (train, X_train, y_train, W_train), (test, X_train, y_train, W_train) = (
  (_, X_train, y_train, W_train), (test, X_train, y_train, W_train) = (
      train_data, test_data) 
  model = RandomForestClassifier(n_estimators=100, n_jobs=-1,
      class_weight="auto")
  model.fit(X_train, y_train)
  # Evaluation path: eval_model is run on the first element of test_data, the
  # results are turned into scores by compute_roc_auc_scores, and the mean of
  # the score values is printed (Python 2 print statement).
  results = eval_model(test, model, task_types)
  scores = compute_roc_auc_scores(results)
  print "Mean AUC: %f" % np.mean(np.array(scores.values()))
  return model
+2 −2
Original line number Diff line number Diff line
@@ -133,7 +133,7 @@ def parse_args(input_args=None):
  group = train_cmd.add_argument_group("save")
  group.add_argument("--saved-out", type=str, required=1,
                  help="Location to save trained model.")
  train_cmd.set_defaults(func=train_model)
  train_cmd.set_defaults(func=fit_model)

  eval_cmd = subparsers.add_parser("eval",
                help="Evaluate trained model on test data processed by transform.")
@@ -199,7 +199,7 @@ def train_test_input(args):
  with gzip.open(args.out, "wb") as f:
    pickle.dump(per_task_data, f)

def train_model(args):
def fit_model(args):
  """Builds model from featurized data."""
  targets = get_target_names(args.paths)
  task_types = {target: args.task_type for target in targets}
+1 −1
Original line number Diff line number Diff line
@@ -54,7 +54,7 @@ def analyze_data(dataset, splittype="random"):
      train, test = train_test_scaffold_split(data)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    Xtrain, ytrain = dataset_to_numpy(train)
    _, Xtrain, ytrain, _ = dataset_to_numpy(train)
    # TODO(rbharath): Take this out once debugging is completed
    ytrain = np.log(ytrain)
    mean = np.mean(ytrain)
Loading