Commit c9a61899 authored by Bharath Ramsundar
Browse files

First round of changes to run from package.

parent 205b9176
Loading
Loading
Loading
Loading
+13 −11
Original line number Diff line number Diff line
@@ -2,21 +2,23 @@
Code for processing the Google vs-datasets using keras.
"""
import numpy as np
import sys
import keras
from keras.models import Graph
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from dataset_arxiv import load_datasets
from dataset_arxiv import multitask_to_singletask
from dataset_arxiv import train_test_random_split
from dataset_arxiv import train_test_scaffold_split
from dataset_arxiv import dataset_to_numpy
from dataset_arxiv import to_one_hot
from dataset_arxiv import eval_model
from dataset_arxiv import compute_r2_scores
from dataset_arxiv import compute_rms_scores
from dataset_arxiv import compute_roc_auc_scores
from dataset_arxiv import load_and_transform_dataset
from deep_chem.utils.load import load_datasets
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.load import load_and_transform_dataset

def process_multitask(paths, task_transforms, desc_transforms, splittype="random",
    seed=None, add_descriptors=False, desc_weight=0.5):
+14 −14
Original line number Diff line number Diff line
@@ -2,15 +2,15 @@
Code for processing the Google vs-datasets using scikit-learn.
"""
import numpy as np
from dataset_arxiv import load_and_transform_dataset
from dataset_arxiv import multitask_to_singletask
from dataset_arxiv import train_test_random_split
from dataset_arxiv import train_test_scaffold_split
from dataset_arxiv import dataset_to_numpy
from dataset_arxiv import eval_model
from dataset_arxiv import compute_r2_scores
from dataset_arxiv import compute_rms_scores
from dataset_arxiv import compute_roc_auc_scores
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskLasso 
@@ -61,17 +61,17 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
      raise ValueError("Improper splittype. Must be random/scaffold.")
    X_train, y_train, W_train = dataset_to_numpy(train)
    X_test, y_test, W_test = dataset_to_numpy(test)
    if modeltype == "random_forest_regressor":
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
    elif modeltype == "random_forest_classifier":
    elif modeltype == "rf_classifier":
      model = RandomForestClassifier(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
    elif modeltype == "logistic_regression":
    elif modeltype == "logistic":
      model = LogisticRegression(class_weight="auto")
    elif modeltype == "linear_regression":
    elif modeltype == "linear":
      model = LinearRegression(normalize=True)
    elif modeltype == "ridge_regression":
    elif modeltype == "ridge":
      model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], normalize=True) 
    elif modeltype == "lasso":
      model = LassoCV(max_iter=2000, n_jobs=-1) 
+16 −14
Original line number Diff line number Diff line
"""
Convenience script to train basic models on supported datasets.
"""
import argparse
import numpy as np
from deep_chem.models.keras import fit_singletask_mlp
from deep_chem.models.keras import fit_multitask_mlp
from deep_chem.models.keras import train_multitask_model
from deep_chem.models.sklearn import fit_singletask_models
from deep_chem.models.sklearn import fit_multitask_rf
from deep_chem.models.deep import fit_singletask_mlp
from deep_chem.models.deep import fit_multitask_mlp
from deep_chem.models.deep import train_multitask_model
from deep_chem.models.standard import fit_singletask_models
from deep_chem.models.standard import fit_multitask_rf
from deep_chem.utils.analysis import compare_datasets
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_roc_auc_scores
@@ -20,6 +21,8 @@ from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import scaffold_separate
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import get_default_task_types_and_transforms
from deep_chem.utils.preprocess import get_default_descriptor_transforms

def parse_args(input_args=None):
  """Parse command-line arguments."""
@@ -29,11 +32,10 @@ def parse_args(input_args=None):
  parser.add_argument('--dataset', required=1, choices=['muv', 'pcba', 'dude', 'pfizer'],
                      help='Name of dataset to process.')
  parser.add_argument('--model', required=1, nargs="+",
                      choices=["logistic_regression", "random_forest", "single_task_deep_network"])
                      choices=["logistic", "rf_classifier", "single_task_deep_network"])
  return parser.parse_args(input_args)


if __name__ == "__main__":
def main():
  args = parse_args()
  if args.dataset == "muv":
    path = "/home/rbharath/vs-datasets/muv"
@@ -52,11 +54,7 @@ if __name__ == "__main__":

  if len(args.model) == 1:
    model = args.model[0]
    if model == "logistic_regression":
      fit_singletask_models([path], "logistic_regression", task_types,
          task_transforms, splittype="scaffold")
    elif model == "random_forest":
      fit_singletask_models([path], "random_forest", task_types,
    fit_singletask_models([path], model, task_types,
        task_transforms, splittype="scaffold")

  #fit_multitask_mlp([muv_path, pfizer_path], task_types, task_transforms,
@@ -76,3 +74,7 @@ if __name__ == "__main__":
  #fit_singletask_mlp([muv_path], task_types, task_transforms, desc_transforms,
  #  splittype="scaffold", add_descriptors=False, n_hidden=500,
  #  learning_rate=.01, dropout=.5, nb_epoch=30, decay=1e-4)


if __name__ == "__main__":
  main()
+6 −0
Original line number Diff line number Diff line
@@ -5,6 +5,12 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import gzip
import numpy as np
import os
import cPickle as pickle
from deep_chem.utils.preprocess import transform_outputs

def load_descriptors(paths, descriptor_dir_name="descriptors"):
  """Load dataset descriptors and return.

+6 −2
Original line number Diff line number Diff line
@@ -5,9 +5,13 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import numpy as np
from deep_chem.utils.load import get_target_names

def get_default_descriptor_transforms():
  """Provides default descriptor transforms for rdkit descriptors."""
  # TODO(rbharath): Remove these magic numbers 
  desc_transforms = {}
  n_descriptors = 196 - 39
  for desc in range(n_descriptors):
    desc_transforms[desc] = ["normalize"]
@@ -22,7 +26,7 @@ def get_default_task_types_and_transforms(dataset_specs):
    Maps name of datasets to filepath.
  """
  task_types, task_transforms = {}, {}
  for name, path in dataset_specs.itervalues():
  for name, path in dataset_specs.iteritems():
    targets = get_target_names([path])
    if name == "muv" or name == "dude" or name == "pcba":
      for target in targets:
@@ -33,7 +37,7 @@ def get_default_task_types_and_transforms(dataset_specs):
        task_types[target] = "regression"
        task_transforms[target] = ["log", "normalize"]
    elif name == "pdbbind":

      raise ValueError("pdbbind not yet supported!")
  return task_types, task_transforms

def transform_outputs(dataset, task_transforms, desc_transforms={},
Loading