Commit cc99dfb5 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

cleaning up changes

parent 608ef37a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ class Model(BaseEstimator):
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

  def restore(self):
  def reload(self):
    """
    Reload trained model from disk.
    """
+1 −1
Original line number Diff line number Diff line
@@ -92,7 +92,7 @@ class SklearnModel(Model):
    """Saves sklearn model to disk using joblib."""
    save_to_disk(self.model_instance, self.get_model_filename(self.model_dir))

  def restore(self):
  def reload(self):
    """Loads sklearn model from joblib file on disk.

    Restores `self.model_instance` from the file previously written by
    `save`; assumes a model was saved under `self.model_dir`.
    """
    # NOTE(review): `save` (above) resolves the path via
    # self.get_model_filename(...), while this uses Model.get_model_filename
    # directly — presumably the same helper, but verify the two resolve to
    # the same file.
    self.model_instance = load_from_disk(
        Model.get_model_filename(self.model_dir))
+160 −59
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
"""
This file provides utilities to run the MoleculeNet benchmark suite.
Created on Mon Mar 06 14:25:40 2017

@author: Zhenqin Wu
"""
import os
import time
import csv
import logging
import numpy as np
import tensorflow as tf
import deepchem
@@ -13,43 +15,6 @@ from deepchem.molnet.run_benchmark_models import benchmark_classification, bench
from deepchem.molnet.check_availability import CheckFeaturizer, CheckSplit
from deepchem.molnet.preset_hyper_parameters import hps

logger = logging.getLogger(__name__)

# Loading functions available
loading_functions = {
    'bace_c': deepchem.molnet.load_bace_classification,
    'bace_r': deepchem.molnet.load_bace_regression,
    'bbbp': deepchem.molnet.load_bbbp,
    'chembl': deepchem.molnet.load_chembl,
    'clearance': deepchem.molnet.load_clearance,
    'clintox': deepchem.molnet.load_clintox,
    'delaney': deepchem.molnet.load_delaney,
    'factors': deepchem.molnet.load_factors,
    'hiv': deepchem.molnet.load_hiv,
    'hopv': deepchem.molnet.load_hopv,
    'hppb': deepchem.molnet.load_hppb,
    'kaggle': deepchem.molnet.load_kaggle,
    'kinase': deepchem.molnet.load_kinase,
    'lipo': deepchem.molnet.load_lipo,
    'muv': deepchem.molnet.load_muv,
    'nci': deepchem.molnet.load_nci,
    'pcba': deepchem.molnet.load_pcba,
    'pcba_146': deepchem.molnet.load_pcba_146,
    'pcba_2475': deepchem.molnet.load_pcba_2475,
    'pdbbind': deepchem.molnet.load_pdbbind_grid,
    'ppb': deepchem.molnet.load_ppb,
    'qm7': deepchem.molnet.load_qm7_from_mat,
    'qm7b': deepchem.molnet.load_qm7b_from_mat,
    'qm8': deepchem.molnet.load_qm8,
    'qm9': deepchem.molnet.load_qm9,
    'sampl': deepchem.molnet.load_sampl,
    'sider': deepchem.molnet.load_sider,
    'thermosol': deepchem.molnet.load_thermosol,
    'tox21': deepchem.molnet.load_tox21,
    'toxcast': deepchem.molnet.load_toxcast,
    'uv': deepchem.molnet.load_uv,
}


def run_benchmark(datasets,
                  model,
@@ -66,21 +31,16 @@ def run_benchmark(datasets,
                  test=False,
                  reload=True,
                  seed=123):
  """Run MoleculeNet benchmark suite.

  This is a utility function to help run the MoleculeNet benchmark
  suite on a specified model and a specified dataset.

  Run benchmark test on designated datasets with deepchem(or
  user-defined) model.
  """
  Run benchmark test on designated datasets with deepchem(or user-defined) model

  Parameters
  ----------
  datasets: list of string
      choice of which datasets to use, should be one of: bace_c,
      bace_r, bbbp, chembl, clearance, clintox, delaney, hiv, hopv,
      kaggle, lipo, muv, nci, pcba, pdbbind, ppb, qm7, qm7b, qm8, qm9,
      sampl, sider, tox21, toxcast, uv, factors, kinase
      choice of which datasets to use, should be: bace_c, bace_r, bbbp, chembl,
      clearance, clintox, delaney, hiv, hopv, kaggle, lipo, muv, nci, pcba,
      pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider, tox21, toxcast, uv, factors,
      kinase
  model: string or user-defined model structure
      choice of which model to use, deepchem provides implementation of
      logistic regression, random forest, multitask network,
@@ -89,10 +49,10 @@ def run_benchmark(datasets,
  split: string,  optional (default=None)
      choice of splitter function, None = using the default splitter
  metric: string, optional (default=None)
      Choice of evaluation metrics, None = using the default metrics(AUC & R2)
  use_max: bool, (default True)
    Specifies whether to maximize or minimize `metric`.
    maximization(True) or minimization(False)
      choice of evaluation metrics, None = using the default metrics(AUC & R2)
  direction: bool, optional(default=True)
      Optimization direction when doing hyperparameter search
      Maximization(True) or minimization(False)
  featurizer: string or dc.feat.Featurizer,  optional (default=None)
      choice of featurization, None = using the default corresponding to model
      (string only applicable to deepchem models)
@@ -150,12 +110,46 @@ def run_benchmark(datasets,
    if not split in [None] + CheckSplit[dataset]:
      continue

    logger.info('-------------------------------------')
    logger.info('Benchmark on dataset: %s' % dataset)
    logger.info('-------------------------------------')
    loading_functions = {
        'bace_c': deepchem.molnet.load_bace_classification,
        'bace_r': deepchem.molnet.load_bace_regression,
        'bbbp': deepchem.molnet.load_bbbp,
        'chembl': deepchem.molnet.load_chembl,
        'clearance': deepchem.molnet.load_clearance,
        'clintox': deepchem.molnet.load_clintox,
        'delaney': deepchem.molnet.load_delaney,
        'factors': deepchem.molnet.load_factors,
        'hiv': deepchem.molnet.load_hiv,
        'hopv': deepchem.molnet.load_hopv,
        'hppb': deepchem.molnet.load_hppb,
        'kaggle': deepchem.molnet.load_kaggle,
        'kinase': deepchem.molnet.load_kinase,
        'lipo': deepchem.molnet.load_lipo,
        'muv': deepchem.molnet.load_muv,
        'nci': deepchem.molnet.load_nci,
        'pcba': deepchem.molnet.load_pcba,
        'pcba_146': deepchem.molnet.load_pcba_146,
        'pcba_2475': deepchem.molnet.load_pcba_2475,
        'pdbbind': deepchem.molnet.load_pdbbind_grid,
        'ppb': deepchem.molnet.load_ppb,
        'qm7': deepchem.molnet.load_qm7_from_mat,
        'qm7b': deepchem.molnet.load_qm7b_from_mat,
        'qm8': deepchem.molnet.load_qm8,
        'qm9': deepchem.molnet.load_qm9,
        'sampl': deepchem.molnet.load_sampl,
        'sider': deepchem.molnet.load_sider,
        'thermosol': deepchem.molnet.load_thermosol,
        'tox21': deepchem.molnet.load_tox21,
        'toxcast': deepchem.molnet.load_toxcast,
        'uv': deepchem.molnet.load_uv,
    }

    print('-------------------------------------')
    print('Benchmark on dataset: %s' % dataset)
    print('-------------------------------------')
    # loading datasets
    if split is not None:
      logger.info('Splitting function: %s' % split)
      print('Splitting function: %s' % split)
      tasks, all_dataset, transformers = loading_functions[dataset](
          featurizer=featurizer, split=split, reload=reload)
    else:
@@ -179,7 +173,8 @@ def run_benchmark(datasets,
          valid_dataset,
          transformers,
          metric,
          use_max=use_max,
          direction=direction,
          n_features=n_features,
          n_tasks=len(tasks),
          max_iter=max_iter,
          search_range=search_range)
@@ -192,6 +187,7 @@ def run_benchmark(datasets,
            test_dataset,
            tasks,
            transformers,
            n_features,
            metric,
            model,
            test=test,
@@ -239,3 +235,108 @@ def run_benchmark(datasets,
    if hyper_param_search:
      with open(os.path.join(out_path, dataset + model + '.pkl'), 'w') as f:
        pickle.dump(hyper_parameters, f)


#
# Note by @XericZephyr. Reason why I spun off this function:
#   1. Some model needs dataset information.
#   2. It offers us possibility to **cache** the dataset
#      if the featurizer runs very slow, e.g., GraphConv.
#   2+. The cache can even happen at Travis CI to accelerate
#       CI testing.
#
def load_dataset(dataset, featurizer, split='random'):
  """
  Load a single MoleculeNet dataset for benchmarking.

  Parameters
  ----------
  dataset: string
      name of the dataset to load; must be one of: bace_c, bace_r,
      bbbp, chembl, clearance, clintox, delaney, factors, hiv, hopv,
      hppb, kaggle, kinase, lipo, muv, nci, pcba, pcba_128, pcba_146,
      pcba_2475, pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider,
      thermosol, tox21, toxcast, uv
  featurizer: string or dc.feat.Featurizer.
      choice of featurization.
  split: string,  optional (default='random')
      choice of splitter function.

  Returns
  -------
  (tasks, all_dataset, transformers) as produced by the corresponding
  deepchem loading function.
  """
  molnet = deepchem.molnet
  # Dispatch table mapping dataset name to its deepchem loader.
  loaders = {
      'bace_c': molnet.load_bace_classification,
      'bace_r': molnet.load_bace_regression,
      'bbbp': molnet.load_bbbp,
      'chembl': molnet.load_chembl,
      'clearance': molnet.load_clearance,
      'clintox': molnet.load_clintox,
      'delaney': molnet.load_delaney,
      'factors': molnet.load_factors,
      'hiv': molnet.load_hiv,
      'hopv': molnet.load_hopv,
      'hppb': molnet.load_hppb,
      'kaggle': molnet.load_kaggle,
      'kinase': molnet.load_kinase,
      'lipo': molnet.load_lipo,
      'muv': molnet.load_muv,
      'nci': molnet.load_nci,
      'pcba': molnet.load_pcba,
      'pcba_128': molnet.load_pcba_128,
      'pcba_146': molnet.load_pcba_146,
      'pcba_2475': molnet.load_pcba_2475,
      'pdbbind': molnet.load_pdbbind_grid,
      'ppb': molnet.load_ppb,
      'qm7': molnet.load_qm7_from_mat,
      'qm7b': molnet.load_qm7b_from_mat,
      'qm8': molnet.load_qm8,
      'qm9': molnet.load_qm9,
      'sampl': molnet.load_sampl,
      'sider': molnet.load_sider,
      'thermosol': molnet.load_thermosol,
      'tox21': molnet.load_tox21,
      'toxcast': molnet.load_toxcast,
      'uv': molnet.load_uv
  }
  print('-------------------------------------')
  print('Loading dataset: %s' % dataset)
  print('-------------------------------------')
  if split is not None:
    print('Splitting function: %s' % split)
  # Delegate the actual featurization/splitting to the deepchem loader.
  tasks, all_dataset, transformers = loaders[dataset](
      featurizer=featurizer, split=split)
  return tasks, all_dataset, transformers


def benchmark_model(model, all_dataset, transformers, metric, test=False):
  """
  Benchmark a user-supplied model on a pre-loaded dataset.

  Parameters
  ----------
  model: user-defined model structure
    For a user-defined model, it should provide `fit(dataset)` and
    `evaluate(dataset, metric, transformers)`.
  all_dataset: (train, valid, test) data tuple.
    Returned by `load_dataset` function.
  transformers
    Forwarded unchanged to `model.evaluate`.
  metric: string
    choice of evaluation metrics.
  test: bool, optional (default=False)
    If True, also score the held-out test split; otherwise the test
    score stays at its initial value of 0.0.

  Returns
  -------
  (train_score, valid_score, test_score, time_for_running)
  """
  train_split, valid_split, test_split = all_dataset

  fit_started = time.time()
  model.fit(train_split)
  # Evaluate each split; the test split only on request.
  scores = {
      'train': model.evaluate(train_split, metric, transformers),
      'valid': model.evaluate(valid_split, metric, transformers),
      'test': model.evaluate(test_split, metric, transformers) if test else .0,
  }
  elapsed = time.time() - fit_started

  return scores['train'], scores['valid'], scores['test'], elapsed
+2 −0
Original line number Diff line number Diff line
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  6 23:41:26 2017

+0 −19
Original line number Diff line number Diff line
@@ -22,25 +22,6 @@ nevertheless, deep learning systems can't simply chew up raw files.
For this reason, :code:`deepchem` provides an extensive collection of
featurization methods which we will review on this page.

Featurizer-Model Matchups
-------------------------

If you're using DeepChem in practical applications, you probably want
to use a given model on some dataset. Your first question when you try
to do this will probably be which featurizer should I use?

+------------+--------------------------+-----------+
| Model      | Acceptable Featurizers   | Header 3  |
+============+==========================+===========+
| body row 1 | column 2                 | column 3  |
+------------+--------------------------+-----------+
| body row 2 | Cells may span columns.              |
+------------+--------------------------+-----------+
| body row 3 | Cells may                | - Cells   |
+------------+ span rows.               | - contain |
| body row 4 |                          | - blocks. |
+------------+--------------------------+-----------+

Featurizer
----------

Loading