Commit ad722420 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes

parent af2db874
Loading
Loading
Loading
Loading
+1 −5
Original line number Diff line number Diff line
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Mar  7 00:07:10 2017

@author: zqwu
This file holds the current best set of hyperparameters for the Molnet benchmark.
"""
import deepchem

+59 −160
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 06 14:25:40 2017

@author: Zhenqin Wu
This file provides utilities to run the MoleculeNet benchmark suite.
"""
import os
import time
import csv
import logging
import numpy as np
import tensorflow as tf
import deepchem
@@ -15,6 +13,43 @@ from deepchem.molnet.run_benchmark_models import benchmark_classification, bench
from deepchem.molnet.check_availability import CheckFeaturizer, CheckSplit
from deepchem.molnet.preset_hyper_parameters import hps

logger = logging.getLogger(__name__)

# Registry of the MoleculeNet loaders that the benchmark can dispatch to,
# keyed by the short dataset name accepted in `datasets`.
loading_functions = dict(
    bace_c=deepchem.molnet.load_bace_classification,
    bace_r=deepchem.molnet.load_bace_regression,
    bbbp=deepchem.molnet.load_bbbp,
    chembl=deepchem.molnet.load_chembl,
    clearance=deepchem.molnet.load_clearance,
    clintox=deepchem.molnet.load_clintox,
    delaney=deepchem.molnet.load_delaney,
    factors=deepchem.molnet.load_factors,
    hiv=deepchem.molnet.load_hiv,
    hopv=deepchem.molnet.load_hopv,
    hppb=deepchem.molnet.load_hppb,
    kaggle=deepchem.molnet.load_kaggle,
    kinase=deepchem.molnet.load_kinase,
    lipo=deepchem.molnet.load_lipo,
    muv=deepchem.molnet.load_muv,
    nci=deepchem.molnet.load_nci,
    pcba=deepchem.molnet.load_pcba,
    pcba_146=deepchem.molnet.load_pcba_146,
    pcba_2475=deepchem.molnet.load_pcba_2475,
    pdbbind=deepchem.molnet.load_pdbbind_grid,
    ppb=deepchem.molnet.load_ppb,
    qm7=deepchem.molnet.load_qm7_from_mat,
    qm7b=deepchem.molnet.load_qm7b_from_mat,
    qm8=deepchem.molnet.load_qm8,
    qm9=deepchem.molnet.load_qm9,
    sampl=deepchem.molnet.load_sampl,
    sider=deepchem.molnet.load_sider,
    thermosol=deepchem.molnet.load_thermosol,
    tox21=deepchem.molnet.load_tox21,
    toxcast=deepchem.molnet.load_toxcast,
    uv=deepchem.molnet.load_uv,
)


def run_benchmark(datasets,
                  model,
@@ -31,16 +66,21 @@ def run_benchmark(datasets,
                  test=False,
                  reload=True,
                  seed=123):
  """
  Run benchmark test on designated datasets with deepchem(or user-defined) model
  """Run MoleculeNet benchmark suite.

  This is a utility function to help run the MoleculeNet benchmark
  suite on a specified model and a specified dataset.

  Run benchmark test on designated datasets with deepchem(or
  user-defined) model.

  Parameters
  ----------
  datasets: list of string
      choice of which datasets to use, should be: bace_c, bace_r, bbbp, chembl,
      clearance, clintox, delaney, hiv, hopv, kaggle, lipo, muv, nci, pcba,
      pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider, tox21, toxcast, uv, factors,
      kinase
      choice of which datasets to use, should be one of: bace_c,
      bace_r, bbbp, chembl, clearance, clintox, delaney, hiv, hopv,
      kaggle, lipo, muv, nci, pcba, pdbbind, ppb, qm7, qm7b, qm8, qm9,
      sampl, sider, tox21, toxcast, uv, factors, kinase
  model: string or user-defined model stucture
      choice of which model to use, deepchem provides implementation of
      logistic regression, random forest, multitask network,
@@ -49,10 +89,10 @@ def run_benchmark(datasets,
  split: string,  optional (default=None)
      choice of splitter function, None = using the default splitter
  metric: string, optional (default=None)
      choice of evaluation metrics, None = using the default metrics(AUC & R2)
  direction: bool, optional(default=True)
      Optimization direction when doing hyperparameter search
      Maximization(True) or minimization(False)
      Choice of evaluation metrics, None = using the default metrics(AUC & R2)
  use_max: bool, (default True)
    Specifies whether to maximize or minimize `metric`.
    maximization(True) or minimization(False)
  featurizer: string or dc.feat.Featurizer,  optional (default=None)
      choice of featurization, None = using the default corresponding to model
      (string only applicable to deepchem models)
@@ -110,46 +150,12 @@ def run_benchmark(datasets,
    if not split in [None] + CheckSplit[dataset]:
      continue

    loading_functions = {
        'bace_c': deepchem.molnet.load_bace_classification,
        'bace_r': deepchem.molnet.load_bace_regression,
        'bbbp': deepchem.molnet.load_bbbp,
        'chembl': deepchem.molnet.load_chembl,
        'clearance': deepchem.molnet.load_clearance,
        'clintox': deepchem.molnet.load_clintox,
        'delaney': deepchem.molnet.load_delaney,
        'factors': deepchem.molnet.load_factors,
        'hiv': deepchem.molnet.load_hiv,
        'hopv': deepchem.molnet.load_hopv,
        'hppb': deepchem.molnet.load_hppb,
        'kaggle': deepchem.molnet.load_kaggle,
        'kinase': deepchem.molnet.load_kinase,
        'lipo': deepchem.molnet.load_lipo,
        'muv': deepchem.molnet.load_muv,
        'nci': deepchem.molnet.load_nci,
        'pcba': deepchem.molnet.load_pcba,
        'pcba_146': deepchem.molnet.load_pcba_146,
        'pcba_2475': deepchem.molnet.load_pcba_2475,
        'pdbbind': deepchem.molnet.load_pdbbind_grid,
        'ppb': deepchem.molnet.load_ppb,
        'qm7': deepchem.molnet.load_qm7_from_mat,
        'qm7b': deepchem.molnet.load_qm7b_from_mat,
        'qm8': deepchem.molnet.load_qm8,
        'qm9': deepchem.molnet.load_qm9,
        'sampl': deepchem.molnet.load_sampl,
        'sider': deepchem.molnet.load_sider,
        'thermosol': deepchem.molnet.load_thermosol,
        'tox21': deepchem.molnet.load_tox21,
        'toxcast': deepchem.molnet.load_toxcast,
        'uv': deepchem.molnet.load_uv,
    }

    print('-------------------------------------')
    print('Benchmark on dataset: %s' % dataset)
    print('-------------------------------------')
    logger.info('-------------------------------------')
    logger.info('Benchmark on dataset: %s' % dataset)
    logger.info('-------------------------------------')
    # loading datasets
    if split is not None:
      print('Splitting function: %s' % split)
      logger.info('Splitting function: %s' % split)
      tasks, all_dataset, transformers = loading_functions[dataset](
          featurizer=featurizer, split=split, reload=reload)
    else:
@@ -173,8 +179,7 @@ def run_benchmark(datasets,
          valid_dataset,
          transformers,
          metric,
          direction=direction,
          n_features=n_features,
          use_max=use_max,
          n_tasks=len(tasks),
          max_iter=max_iter,
          search_range=search_range)
@@ -187,7 +192,6 @@ def run_benchmark(datasets,
            test_dataset,
            tasks,
            transformers,
            n_features,
            metric,
            model,
            test=test,
@@ -235,108 +239,3 @@ def run_benchmark(datasets,
    if hyper_param_search:
      with open(os.path.join(out_path, dataset + model + '.pkl'), 'w') as f:
        pickle.dump(hyper_parameters, f)


#
# Note by @XericZephyr. Reason why I spun off this function:
#   1. Some models need dataset information.
#   2. It offers us the possibility to **cache** the dataset
#      if the featurizer runs very slowly, e.g., GraphConv.
#   2+. The cache can even happen at Travis CI to accelerate
#       CI testing.
#
def load_dataset(dataset, featurizer, split='random'):
  """Load one MoleculeNet dataset for benchmarking.

  Parameters
  ----------
  dataset: string
      choice of which dataset to use, should be one of: bace_c, bace_r,
      bbbp, chembl, clearance, clintox, delaney, factors, hiv, hopv, hppb,
      kaggle, kinase, lipo, muv, nci, pcba, pcba_128, pcba_146, pcba_2475,
      pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider, thermosol, tox21,
      toxcast, uv
  featurizer: string or dc.feat.Featurizer.
      choice of featurization, forwarded to the loader as `featurizer`.
  split: string, optional (default='random')
      choice of splitter function, forwarded to the loader as `split`.
      None is forwarded unchanged; presumably the loader then falls back
      to its own default splitter -- confirm against the loaders.

  Returns
  -------
  tasks, all_dataset, transformers
      The three values returned by the selected `deepchem.molnet` loader,
      passed back unchanged.

  Raises
  ------
  KeyError
      If `dataset` is not one of the supported names. This is raised
      before anything is printed.
  """
  # Loader attribute names rather than the loader objects themselves: the
  # attribute is resolved only for the dataset actually requested, so a
  # loader missing from the installed deepchem cannot break the others.
  loader_names = {
      'bace_c': 'load_bace_classification',
      'bace_r': 'load_bace_regression',
      'bbbp': 'load_bbbp',
      'chembl': 'load_chembl',
      'clearance': 'load_clearance',
      'clintox': 'load_clintox',
      'delaney': 'load_delaney',
      'factors': 'load_factors',
      'hiv': 'load_hiv',
      'hopv': 'load_hopv',
      'hppb': 'load_hppb',
      'kaggle': 'load_kaggle',
      'kinase': 'load_kinase',
      'lipo': 'load_lipo',
      'muv': 'load_muv',
      'nci': 'load_nci',
      'pcba': 'load_pcba',
      'pcba_128': 'load_pcba_128',
      'pcba_146': 'load_pcba_146',
      'pcba_2475': 'load_pcba_2475',
      'pdbbind': 'load_pdbbind_grid',
      'ppb': 'load_ppb',
      'qm7': 'load_qm7_from_mat',
      'qm7b': 'load_qm7b_from_mat',
      'qm8': 'load_qm8',
      'qm9': 'load_qm9',
      'sampl': 'load_sampl',
      'sider': 'load_sider',
      'thermosol': 'load_thermosol',
      'tox21': 'load_tox21',
      'toxcast': 'load_toxcast',
      'uv': 'load_uv',
  }
  # Same exception type as the plain dict lookup used to raise, but with a
  # message that tells the caller which names are accepted.
  if dataset not in loader_names:
    raise KeyError('Unknown dataset %r; expected one of: %s' %
                   (dataset, ', '.join(sorted(loader_names))))
  loader = getattr(deepchem.molnet, loader_names[dataset])

  print('-------------------------------------')
  print('Loading dataset: %s' % dataset)
  print('-------------------------------------')
  # loading datasets
  if split is not None:
    print('Splitting function: %s' % split)
  tasks, all_dataset, transformers = loader(featurizer=featurizer, split=split)
  return tasks, all_dataset, transformers


def benchmark_model(model, all_dataset, transformers, metric, test=False):
  """Fit a user-defined model and score it on the benchmark splits.

  Parameters
  ----------
  model: user-defined model structure
      Must provide `fit(dataset)` and
      `evaluate(dataset, metric, transformers)`.
  all_dataset: (train, valid, test) data tuple
      Unpacked in exactly this order. Presumably the `all_dataset` value
      returned by `load_dataset` -- confirm that the loader uses the same
      ordering.
  transformers:
      Forwarded unchanged as the third argument of `model.evaluate`.
  metric:
      choice of evaluation metrics, forwarded unchanged as the second
      argument of `model.evaluate`.
  test: bool, optional (default=False)
      Whether to also evaluate on the test split.

  Returns
  -------
  train_score, valid_score, test_score, time_for_running
      The scores are whatever `model.evaluate` returns for each split;
      `test_score` is 0.0 when `test` is False. `time_for_running` is the
      wall-clock time in seconds covering both fitting and evaluation.
  """
  time_start_fitting = time.time()

  train_dataset, valid_dataset, test_dataset = all_dataset

  model.fit(train_dataset)
  train_score = model.evaluate(train_dataset, metric, transformers)
  valid_score = model.evaluate(valid_dataset, metric, transformers)
  # The test split is only touched on request; 0.0 is the placeholder that
  # callers receive otherwise.
  if test:
    test_score = model.evaluate(test_dataset, metric, transformers)
  else:
    test_score = .0

  time_for_running = time.time() - time_start_fitting

  return train_score, valid_score, test_score, time_for_running
+0 −2
Original line number Diff line number Diff line
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Mar  6 23:41:26 2017

+19 −0
Original line number Diff line number Diff line
@@ -22,6 +22,25 @@ nevertheless, deep learning systems can't simply chew up raw files.
For this reason, :code:`deepchem` provides an extensive collection of
featurization methods which we will review on this page.

Featurizer-Model Matchups
-------------------------

If you're using DeepChem in practical applications, you probably want
to use a given model on some dataset. The first question you will
likely ask is: which featurizer should I use?

+------------+--------------------------+-----------+
| Model      | Acceptable Featurizers   | Header 3  |
+============+==========================+===========+
| body row 1 | column 2                 | column 3  |
+------------+--------------------------+-----------+
| body row 2 | Cells may span columns.              |
+------------+--------------------------+-----------+
| body row 3 | Cells may                | - Cells   |
+------------+ span rows.               | - contain |
| body row 4 |                          | - blocks. |
+------------+--------------------------+-----------+

Featurizer
----------

+52 −32
Original line number Diff line number Diff line
@@ -2,121 +2,141 @@ MoleculeNet
===========
The DeepChem library is packaged alongside the MoleculeNet suite of datasets. One of the most important parts of machine learning applications is finding a suitable dataset. The MoleculeNet suite has curated a whole range of datasets and loaded them into DeepChem :code:`dc.data.Dataset` objects for convenience.

Running Benchmark
-----------------

At present, there is only support for running benchmark models

.. autofunction:: deepchem.molnet.run_benchmark

Best Known Hyperparameters
--------------------------

MoleculeNet maintains a list of the currently best known
hyperparameters for various models on MoleculeNet benchmarks.

MoleculeNet Datasets
--------------------

MoleculeNet is actively maintained and contains a growing set of
different datasets. Here is the set of currently available
MoleculeNet datasets.

BACE Dataset
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_bace_classification

.. autofunction:: deepchem.molnet.load_bace_regression

BBBC Datasets
-------------
^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_bbbc001

.. autofunction:: deepchem.molnet.load_bbbc002

BBBP Datasets
-------------
^^^^^^^^^^^^^
BBBP stands for Blood-Brain-Barrier Penetration

.. autofunction:: deepchem.molnet.load_bbbp

Cell Counting Datasets
----------------------
^^^^^^^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_cell_counting

Chembl Datasets
---------------
^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_chembl

Chembl25 Datasets
---------------
^^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_chembl25

Clearance Datasets
------------------
^^^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_clearance

Clintox Datasets
----------------
^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_clintox

Delaney Datasets
----------------
^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_delaney

Factors Datasets
----------------
^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_factors

HIV Datasets
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_hiv

HOPV Datasets
-------------
^^^^^^^^^^^^^
HOPV stands for the Harvard Organic Photovoltaic Dataset.

.. autofunction:: deepchem.molnet.load_hopv

HPPB Datasets
-------------
^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_hppb


KAGGLE Datasets
---------------
^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_kaggle

Kinase Datasets
---------------
^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_kinase


Lipo Datasets
-------------
^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_lipo

MUV Datasets
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_muv

NCI Datasets
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_nci

PCBA Datasets
-------------
^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_pcba

PDBBIND Datasets
----------------
^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_pdbbind

PPB Datasets
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_ppb

QM7 Datasets
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_qm7

@@ -125,54 +145,54 @@ QM7 Datasets
.. autofunction:: deepchem.molnet.load_qm7b_from_mat

QM8 Datasets
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_qm8

QM9 Datasets
------------
^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_qm9


SAMPL Datasets
--------------
^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_sampl


SIDER Datasets
--------------
^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_sider

SWEETLEAD Datasets
------------------
^^^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_sweetlead

Thermosol Datasets
------------------
^^^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_thermosol


Tox21 Datasets
--------------
^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_tox21

Toxcast Datasets
----------------
^^^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_toxcast

USPTO Datasets
--------------
^^^^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_uspto

UV Datasets
-----------
^^^^^^^^^^^

.. autofunction:: deepchem.molnet.load_uv