Commit f95d82c8 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #302 from miaecle/kaggle

update README file & benchmark script
parents 46e906a7 32793806
Loading
Loading
Loading
Loading
+336 −244
Original line number Diff line number Diff line
# DeepChem
# DeepChem

DeepChem aims to provide a high quality open-source toolchain that
democratizes the use of deep-learning in drug discovery, materials science, and quantum
@@ -203,23 +203,115 @@ passed a ``Featurizer`` object. DeepChem provides a number of
different subclasses of ``Featurizer`` for convenience:

### Performances
|Dataset    |N(tasks)	|N(samples) |Model               |Train score/ROC-AUC|Valid score/ROC-AUC|Time(loading)/s |Time(running)/s|
|-----------|-----------|-----------|--------------------|-------------------|-------------------|----------------|---------------| 
|tox21      |12         |8014       |logistic regression |0.910              |0.759              |30              |30             |
|           |           |           |tensorflow(MT-NN)   |0.987              |0.800              |30              |30             |
|           |           |           |graph convolution   |0.930              |0.819              |40              |40             |
|muv        |17         |93127      |logistic regression |0.910              |0.744              |600             |800            |
|           |           |           |tensorflow(MT-NN)   |0.980              |0.710              |600             |800            |
|           |           |           |graph convolution   |0.881              |0.832              |800             |1200           |
|pcba       |128        |439863     |logistic regression |0.794        	     |0.762              |1800            |15000          |                                         
|           |           |           |tensorflow(MT-NN)	 |0.949        	     |0.791              |1800            |15000          |                                         
|           |           |           |graph convolution   |0.866        	     |0.836              |2200            |20000          |                                         
|sider      |27         |1427       |logistic regression |0.900        	     |0.620              |15              |40             |                                         
|           |           |           |tensorflow(MT-NN)	 |0.931        	     |0.647              |15              |60             |                                         
|           |           |           |graph convolution   |0.845        	     |0.646              |20              |60             |                                         
|toxcast    |617        |8615       |logistic regression |0.762        	     |0.622              |80              |2000           |                                         
|           |           |           |tensorflow(MT-NN)	 |0.926        	     |0.705              |80              |2400           |                                         
|           |           |           |graph convolution   |0.906        	     |0.725              |80              |3000           |                                         
Index splitting

|Dataset    |Model               |Train score/ROC-AUC|Valid score/ROC-AUC|
|-----------|--------------------|-------------------|-------------------|
|tox21      |logistic regression |0.903              |0.705              |
|           |tensorflow(MT-NN)   |0.856              |0.763              |
|           |robust MT-NN        |0.857              |0.767              |
|           |graph convolution   |0.872              |0.798              |
|muv        |logistic regression |0.963              |0.766              |
|           |tensorflow(MT-NN)   |0.904              |0.764              |
|           |robust MT-NN        |0.934              |0.781              |
|           |graph convolution   |0.840              |0.823              |
|pcba       |logistic regression |0.809              |0.776              |
|           |tensorflow(MT-NN)   |0.826              |0.802              |
|           |robust MT-NN        |0.809              |0.783              |
|           |graph convolution   |0.876              |0.852              |
|sider      |logistic regression |0.933              |0.620              |
|           |tensorflow(MT-NN)   |0.775              |0.634              |
|           |robust MT-NN        |0.803              |0.632              |
|           |graph convolution   |0.708              |0.594              |
|toxcast    |logistic regression |0.721              |0.575              |
|           |tensorflow(MT-NN)   |0.830              |0.678              |
|           |robust MT-NN        |0.825              |0.680              |
|           |graph convolution   |0.821              |0.720              |

Random splitting

|Dataset    |Model               |Train score/ROC-AUC|Valid score/ROC-AUC|
|-----------|--------------------|-------------------|-------------------|
|tox21      |logistic regression |0.903              |0.741              |
|           |tensorflow(MT-NN)   |0.846              |0.812              |
|           |robust MT-NN        |0.844              |0.793              |
|           |graph convolution   |0.872              |0.816              |
|muv        |logistic regression |0.961              |0.696              |
|           |tensorflow(MT-NN)   |0.895              |0.740              |
|           |robust MT-NN        |0.914              |0.667              |
|           |graph convolution   |0.846              |0.776              |
|pcba       |logistic regression |0.807        	     |0.772              |
|           |tensorflow(MT-NN)   |0.811        	     |0.787              |
|           |robust MT-NN        |0.809              |0.778              |
|           |graph convolution   |0.875       	     |0.844              |
|sider      |logistic regression |0.932        	     |0.628              |
|           |tensorflow(MT-NN)   |0.779        	     |0.665              |
|           |robust MT-NN        |0.761              |0.621              |
|           |graph convolution   |0.706        	     |0.638              |
|toxcast    |logistic regression |0.737        	     |0.543              |
|           |tensorflow(MT-NN)   |0.831        	     |0.684              |
|           |robust MT-NN        |0.814              |0.692              |
|           |graph convolution   |0.820        	     |0.692              |

Scaffold splitting

|Dataset    |Model               |Train score/ROC-AUC|Valid score/ROC-AUC|
|-----------|--------------------|-------------------|-------------------|
|tox21      |logistic regression |0.900              |0.650              |
|           |tensorflow(MT-NN)   |0.863              |0.703              |
|           |robust MT-NN        |0.861              |0.710              |
|           |graph convolution   |0.885              |0.732              |
|muv        |logistic regression |0.947              |0.767              |
|           |tensorflow(MT-NN)   |0.899              |0.762              |
|           |robust MT-NN        |0.944              |0.726              |
|           |graph convolution   |0.872              |0.795              |
|pcba       |logistic regression |0.810              |0.742              |
|           |tensorflow(MT-NN)   |0.814              |0.760              |
|           |robust MT-NN        |0.812              |0.756              |
|           |graph convolution   |0.874              |0.817              |
|sider      |logistic regression |0.926              |0.592              |
|           |tensorflow(MT-NN)   |0.776              |0.557              |
|           |robust MT-NN        |0.797              |0.560              |
|           |graph convolution   |0.722              |0.583              |
|toxcast    |logistic regression |0.716              |0.492              |
|           |tensorflow(MT-NN)   |0.828              |0.617              |
|           |robust MT-NN        |0.830              |0.614              |
|           |graph convolution   |0.832              |0.638              |

Number of tasks and examples in the datasets

|Dataset    |N(tasks)	|N(samples) |
|-----------|-----------|-----------| 
|tox21      |12         |8014       |
|muv        |17         |93127      |
|pcba       |128        |439863     |
|sider      |27         |1427       |
|toxcast    |617        |8615       |

Time needed for benchmark test (~20h in total)

|Dataset    |Model               |Time(loading)/s |Time(running)/s|
|-----------|--------------------|----------------|---------------| 
|tox21      |logistic regression |30              |60             |
|           |tensorflow(MT-NN)   |30              |60             |
|           |robust MT-NN        |30              |90             |
|           |graph convolution   |40              |160            |
|muv        |logistic regression |600             |450            |
|           |tensorflow(MT-NN)   |600             |400            |
|           |robust MT-NN        |600             |550            |
|           |graph convolution   |800             |1800           |
|pcba       |logistic regression |1800            |10000          |
|           |tensorflow(MT-NN)	 |1800            |9000           |
|           |robust MT-NN        |1800            |14000          |
|           |graph convolution   |2200            |14000          |
|sider      |logistic regression |15              |80             |
|           |tensorflow(MT-NN)	 |15              |75             |
|           |robust MT-NN        |15              |150            |
|           |graph convolution   |20              |50             |
|toxcast    |logistic regression |80              |2600           |
|           |tensorflow(MT-NN)   |80              |2300           |
|           |robust MT-NN        |80              |4000           |
|           |graph convolution   |80              |900            |


## Contributing to DeepChem
+93 −81
Original line number Diff line number Diff line
@@ -13,20 +13,9 @@ Giving performances of: Random forest(rf), MultitaskDNN(tf),
                    
on datasets: muv, nci, pcba, tox21, sider, toxcast

time estimation(on a nvidia tesla K20 GPU):
tox21   - dataloading: 30s
        - tf: 40s
muv     - dataloading: 400s
        - tf: 250s
pcba    - dataloading: 30min
        - tf: 2h
sider   - dataloading: 10s
        - tf: 60s
toxcast - dataloading: 70s
        - tf: 40min
(will include more)

Total time of running a benchmark test: 3~4h
Time estimates are listed in the README file.

Total time to run a benchmark test (for one splitting function): 20h
"""
from __future__ import print_function
from __future__ import division
@@ -39,6 +28,7 @@ import shutil
import time
import deepchem as dc
import tensorflow as tf
import argparse
from keras import backend as K

from sklearn.ensemble import RandomForestClassifier
@@ -51,8 +41,9 @@ from toxcast.toxcast_datasets import load_toxcast
from sider.sider_datasets import load_sider

def benchmark_loading_datasets(base_dir_o, hyper_parameters, 
                               dataset_name='all', model='tf', reload = True,
                               verbosity='high', out_path='/tmp'):
                               dataset='tox21', model='tf', split=None,
                               reload=True, verbosity='high', 
                               out_path='.'):
  """
  Loading dataset for benchmark test
  
@@ -64,18 +55,22 @@ def benchmark_loading_datasets(base_dir_o, hyper_parameters,
  hyper_parameters : dict of list
      hyper parameters including dropout rate, learning rate, etc.
  
  dataset_name : string, optional (default='all')
      choice of which dataset to use, 'all' = computing all the datasets
  dataset : string, optional (default='tox21')
      choice of which dataset to use, should be: tox21, muv, sider, 
      toxcast, pcba
      
  model : string,  optional (default='tf')
      choice of which model to use, should be: rf, tf, tf_robust, logreg,
      graphconv

  out_path : string, optional(default='/tmp')
  split : string,  optional (default=None)
      choice of splitter function, None = using the default splitter

  out_path : string, optional(default='.')
      path of result file
      
  """
  if not dataset_name in ['all','muv','nci','pcba','tox21','sider','toxcast']:
  if not dataset in ['muv','nci','pcba','tox21','sider','toxcast']:
    raise ValueError('Dataset not supported')
                          
  if model in ['graphconv']:
@@ -87,31 +82,31 @@ def benchmark_loading_datasets(base_dir_o, hyper_parameters,
  else:
    raise ValueError('Model not supported')

  if dataset_name == 'all':
    #currently not including the nci dataset
    dataset_name = ['tox21', 'muv', 'pcba', 'sider', 'toxcast']
  else:
    dataset_name = [dataset_name]
  if not split in [None, 'index','random','scaffold']:
    raise ValueError('Splitter function not supported')
  
  loading_functions = {'tox21': load_tox21, 'muv': load_muv,
                       'pcba': load_pcba, 'nci': load_nci,
                       'sider': load_sider, 'toxcast': load_toxcast}
  
  for dname in dataset_name:
  print('-------------------------------------')
    print('Benchmark %s on dataset: %s' % (model, dname))
  print('Benchmark %s on dataset: %s' % (model, dataset))
  print('-------------------------------------')
    base_dir = os.path.join(base_dir_o, dname)
    
  base_dir = os.path.join(base_dir_o, dataset)
  time_start = time.time()
  #loading datasets
    tasks,datasets,transformers = loading_functions[dname](
  if split is not None:
    print('Splitting function: %s' % split)  
    tasks,all_dataset,transformers = loading_functions[dataset](
        featurizer=featurizer, split=split)
  else:
    tasks,all_dataset,transformers = loading_functions[dataset](
        featurizer=featurizer)
    train_dataset, valid_dataset, test_dataset = datasets
  
  train_dataset, valid_dataset, test_dataset = all_dataset
  time_finish_loading = time.time()
  #time_finish_loading-time_start is the time(s) used for dataset loading
    

  #running model
  for count, hp in enumerate(hyper_parameters[model]):
    time_start_fitting = time.time()
@@ -123,7 +118,7 @@ def benchmark_loading_datasets(base_dir_o, hyper_parameters,
    
    with open(os.path.join(out_path, 'results.csv'),'a') as f:
      f.write('\n'+str(count)+',')
        f.write(dname+',train,')
      f.write(dataset+','+split+',train,')
      for i in train_score:
        f.write(i+','+str(train_score[i]['mean-roc_auc_score'])+',')
      f.write('valid,')
@@ -131,12 +126,9 @@ def benchmark_loading_datasets(base_dir_o, hyper_parameters,
        f.write(i+','+str(valid_score[i]['mean-roc_auc_score'])+',')
      f.write('time_for_running,'+
            str(time_finish_fitting-time_start_fitting)+',')

  #clear workspace         
    del tasks,datasets,transformers
  del tasks, all_dataset, transformers
  del train_dataset, valid_dataset, test_dataset
    del time_start,time_finish_loading,time_start_fitting,time_finish_fitting

  return None

def benchmark_train_and_valid(base_dir, train_dataset, valid_dataset, tasks,
@@ -374,37 +366,57 @@ if __name__ == '__main__':
    shutil.rmtree(base_dir_o)
  os.makedirs(base_dir_o)
  
  #Datasets and models used in the benchmark test, all=all the datasets
  dataset_name = 'tox21'
  model = 'tf'
  parser = argparse.ArgumentParser(description='Deepchem benchmark: '+
      'giving performances of different learning models on datasets')
  parser.add_argument('-s', action='append', dest='splitter_args', default=[],
      help='Choice of splitting function: index, random, scaffold')
  parser.add_argument('-m', action='append', dest='model_args', default=[], 
      help='Choice of model: tf, tf_robust, logreg, graphconv')
  parser.add_argument('-d', action='append', dest='dataset_args', default=[], 
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba')
  args = parser.parse_args()
  #Datasets and models used in the benchmark test
  splitters = args.splitter_args
  models = args.model_args
  datasets = args.dataset_args

  if len(splitters) == 0:
    splitters = ['index', 'random', 'scaffold']
  if len(models) == 0:
    models = ['tf', 'tf_robust', 'logreg', 'graphconv']
  if len(datasets) == 0:
    datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba']

  #input hyperparameters
  #tf: dropouts, learning rate, layer_sizes, weight initial stddev,penalty,
  #    batch_size
  hps = {}
  hps = {}
  hps['tf'] = [{'layer_sizes': [500], 'weight_init_stddevs': [0.02], 
                'bias_init_consts': [1.], 'dropouts': [0.5], 'penalty': 0, 
  hps['tf'] = [{'layer_sizes': [1500], 'weight_init_stddevs': [0.02], 
                'bias_init_consts': [1.], 'dropouts': [0.5], 'penalty': 0.1, 
                'penalty_type': 'l2', 'batch_size': 50, 'nb_epoch': 10, 
                'learning_rate': 0.001}]

  hps['tf_robust'] = [{'layer_sizes': [500], 'weight_init_stddevs': [0.02], 
  hps['tf_robust'] = [{'layer_sizes': [1500], 'weight_init_stddevs': [0.02], 
                       'bias_init_consts': [1.], 'dropouts': [0.5], 
                       'bypass_layer_sizes': [100], 
                       'bypass_layer_sizes': [200], 
                       'bypass_weight_init_stddevs': [0.02],
                       'bypass_bias_init_consts': [1.], 
                       'bypass_dropouts': [0.5], 'penalty': 0,
                       'bypass_dropouts': [0.5], 'penalty': 0.1,
                       'penalty_type': 'l2', 'batch_size': 50, 
                       'nb_epoch': 10, 'learning_rate': 0.001}]
                       'nb_epoch': 10, 'learning_rate': 0.0005}]
             
  hps['logreg'] = [{'penalty': 0, 'penalty_type': 'l2', 'batch_size': 50, 
                    'nb_epoch': 10, 'learning_rate': 0.001}]
  hps['logreg'] = [{'penalty': 0.1, 'penalty_type': 'l2', 'batch_size': 50, 
                    'nb_epoch': 10, 'learning_rate': 0.005}]
                
  hps['graphconv'] = [{'batch_size': 50, 'nb_epoch': 10, 
                       'learning_rate': 0.001, 'n_filters': 64, 
                       'n_fully_connected_nodes': 128}]
                       'learning_rate': 0.0005, 'n_filters': 64, 
                       'n_fully_connected_nodes': 128, 'seed': 123}]

  hps['rf'] = [{'n_estimators': 500}]
         
  benchmark_loading_datasets(base_dir_o, hps, dataset_name=dataset_name,
                             model=model, reload=reload, verbosity='high')
  for split in splitters:
    for model in models:
      for dataset in datasets:
        benchmark_loading_datasets(base_dir_o, hps, dataset=dataset, model=model, 
                                   split=split, verbosity='high', out_path='.')
+5 −2
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@ import numpy as np
import shutil
import deepchem as dc

def load_muv(featurizer='ECFP'):
def load_muv(featurizer='ECFP', split='index'):
  """Load MUV datasets. Does not do train/test split"""
  # Load MUV dataset
  print("About to load MUV dataset.")
@@ -42,7 +42,10 @@ def load_muv(featurizer='ECFP'):
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitter = dc.splits.IndexSplitter()
  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(
	dataset, compute_feature_statistics=False)
  return MUV_tasks, (train, valid, test), transformers
+6 −2
Original line number Diff line number Diff line
@@ -13,7 +13,8 @@ import numpy as np
import shutil
import deepchem as dc

def load_nci(featurizer='ECFP', shard_size=1000, num_shards_per_batch=4):
def load_nci(featurizer='ECFP', shard_size=1000, 
             num_shards_per_batch=4, split='random'):

  current_dir = os.path.dirname(os.path.realpath(__file__))

@@ -62,7 +63,10 @@ def load_nci(featurizer='ECFP', shard_size=1000, num_shards_per_batch=4):
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  
  splitter = dc.splits.RandomSplitter()
  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
  splitter = splitters[split]
  print("Performing new split.")
  train, valid, test = splitter.train_valid_test_split(dataset,
	compute_feature_statistics=False)
+6 −3
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@ import numpy as np
import shutil
import deepchem as dc

def load_pcba(featurizer='ECFP'):
def load_pcba(featurizer='ECFP', split='random'):
  """Load PCBA datasets. Does not do train/test split"""
  
  current_dir = os.path.dirname(os.path.realpath(__file__))
@@ -63,7 +63,10 @@ def load_pcba(featurizer='ECFP'):
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  
  splitter = dc.splits.RandomSplitter()
  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
  splitter = splitters[split]
  print("Performing new split.")
  train, valid, test = splitter.train_valid_test_split(
	dataset, compute_feature_statistics=False)
Loading