Commit b667a504 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #282 from miaecle/Benchmark2

Benchmark modified (deepchem to dc)
parents 724a4fac 45766159
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -208,6 +208,9 @@ different subclasses of ``Featurizer`` for convenience:
|tox21      |tensorflow(MT-DNN)  |0.987              |0.800              |35              |36             |
|muv        |tensorflow(MT-DNN)  |0.979              |0.660              |414             |255            |
|pcba       |tensorflow(MT-DNN)  |0.949              |0.791              |1765            |7209           |
|sider      |tensorflow(MT-DNN)  |0.864              |0.627              |10              |63             |
|toxcast    |tensorflow(MT-DNN)  |0.944              |0.697              |75              |2374           |

## Contributing to DeepChem

We actively encourage community contributions to DeepChem. The best way to get started is to run our examples locally. After that, we encourage contributors to try improving our documentation. While we make an effort to provide good docs, there is plenty of room for improvement. All docs are hosted on GitHub, either in this `README.md` file or in the `docs/` directory.
+112 −215
Original line number Diff line number Diff line
@@ -16,7 +16,13 @@ muv - dataloading: 400s
      - tf: 250s
pcba  - dataloading: 30min
      - tf: 2h
sider - dataloading: 10s
      - tf: 60s
toxcast - dataloading: 70s
        - tf: 40min
(will include more)

Total time of running a benchmark test: 3~4h
"""
from __future__ import print_function
from __future__ import division
@@ -27,28 +33,19 @@ import os
import numpy as np
import shutil
import time
import deepchem as dc

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from deepchem.data import Dataset
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.utils.evaluate import Evaluator
from deepchem.models.keras_models.fcnet import MultiTaskDNN
from deepchem.models.keras_models import KerasModel
from deepchem.models.multitask import SingletaskToMultitask
from deepchem.models.sklearn_models import SklearnModel
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.splits import RandomSplitter

from muv.muv_datasets import load_muv
from nci.nci_datasets import load_nci
from pcba.pcba_datasets import load_pcba
from tox21.tox21_datasets import load_tox21
from toxcast.toxcast_datasets import load_toxcast
from sider.sider_datasets import load_sider

def benchmark_loading_datasets(base_dir_o, n_features = 1024, 
def benchmark_loading_datasets(base_dir_o, hyper_parameters, n_features = 1024, 
                               dataset_name='all',model='all',reload = True,
                               verbosity='high',out_path='/tmp'):
  """
@@ -56,260 +53,159 @@ def benchmark_loading_datasets(base_dir_o, n_features = 1024,
  
  Parameters
  ----------
  base_dir_o, string
  base_dir_o : string
      path of working folder, will be combined with '/dataset_name'
  
  n_features, integer, optional (default=1024)
  hyper_parameters : dict of list
      hyper parameters including dropout rate, learning rate, etc.
 
  n_features : integer, optional (default=1024)
      number of features, or length of binary fingerprints
  
  dataset_name, string, optional (default='all')
  dataset_name : string, optional (default='all')
      choice of which dataset to use, 'all' = computing all the datasets
      
  model string, optional (default='all')
  model : string,  optional (default='all')
      choice of which model to use, 'all' = running all models on the dataset
  
  out_path, string, optional(default='/tmp')
  out_path : string, optional(default='/tmp')
      path of result file
      
  """
  assert dataset_name in ['all', 'muv', 'nci', 'pcba', 'tox21']
  assert dataset_name in ['all', 'muv', 'nci', 'pcba', 'tox21','sider',
                          'toxcast']
  
  if dataset_name == 'all':
    #currently not including the nci dataset
    dataset_name = ['muv','pcba','tox21']
    dataset_name = ['tox21','muv','pcba','sider','toxcast']
  else:
    dataset_name = [dataset_name]
  
  if 'tox21' in dataset_name:
  loading_functions = {'tox21':load_tox21, 'muv':load_muv,
                       'pcba':load_pcba, 'nci':load_nci,
                       'sider':load_sider, 'toxcast':load_toxcast}
  
  for dname in dataset_name:
    print('-------------------------------------')
    print('Benchmark test on dataset: tox21')
    print('Benchmark test on dataset: '+dname)
    print('-------------------------------------')
    base_dir = os.path.join(base_dir_o, "tox21")
    base_dir = os.path.join(base_dir_o, dname)
    
    time_start = time.time()
    #loading datasets for tox21
    tasks_tox21,datasets_tox21,transformers_tox21 = load_tox21(base_dir,
                                                               reload=reload)
    #loading datasets     
    tasks,datasets,transformers = loading_functions[dname]()
    train_dataset, valid_dataset, test_dataset = datasets
    time_finish_loading = time.time()
    #time_finish_loading-time_start is the time(s) used for dataset loading
    
    #dataset splitting, built-in method in load_tox21
    train_dataset, valid_dataset = datasets_tox21

    #running model
    tox21_train,tox21_valid = benchmark_train_and_valid(base_dir,train_dataset,
                                                        valid_dataset,
                                                        tasks_tox21,
							transformers_tox21,
                                              		n_features, model,
							verbosity)
    train_score,valid_score = benchmark_train_and_valid(base_dir,train_dataset,
                                                        valid_dataset, tasks,
                                                        transformers,
                                                        hyper_parameters,
                                                        n_features=n_features,
                                                        model = model,
                                                        verbosity = verbosity)
    time_finish_running = time.time()
    #time_finish_running-time_finish_loading is the time(s) used for fitting and evaluating
        
    with open(os.path.join(out_path,'results.csv'),'a') as f:
      f.write ('\n'+'tox21,train')
      for i in tox21_train:
        f.write(','+i+','+str(tox21_train[i])) #output train score
      f.write('\n'+'tox21,valid')
      for i in tox21_valid:
        f.write(','+i+','+str(tox21_valid[i])) #output valid score
      f.write('\n'+dname+',train')
      for i in train_score:
        f.write(','+i+','+str(train_score[i]['mean-roc_auc_score']))
      f.write('\n'+dname+',valid')
      for i in valid_score:
        f.write(','+i+','+str(valid_score[i]['mean-roc_auc_score'])) 
      #output timing data: running time include all the model
      f.write('\n'+'tox21,time_for_loading,'+
      f.write('\n'+dname+',time_for_loading,,'+
              str(time_finish_loading-time_start)+'seconds')
      f.write('\n'+'tox21,time_for_running,'+
      f.write('\n'+dname+',time_for_running,,'+
              str(time_finish_running-time_finish_loading)+'seconds')
    
    #clear workspace         
    del tasks_tox21,datasets_tox21,transformers_tox21
    del train_dataset,valid_dataset
    del time_start,time_finish_loading,time_finish_running

  if 'muv' in dataset_name:
    print('-------------------------------------')
    print('Benchmark test on dataset: muv')
    print('-------------------------------------')
    base_dir = os.path.join(base_dir_o, "muv")
    time_start = time.time()
    #loading datasets for muv
    tasks_muv,datasets_muv,transformers_muv = load_muv(base_dir,reload=reload)
    time_finish_loading = time.time()    
    
    #dataset splitting, built-in method in load_tox21
    train_dataset, valid_dataset = datasets_muv
    #running model
    muv_train,muv_valid = benchmark_train_and_valid(base_dir,train_dataset,
                                                    valid_dataset,
                                                    tasks_muv,transformers_muv,
                                                    n_features,model,verbosity)
    time_finish_running = time.time()
    
    with open(os.path.join(out_path,'results.csv'),'a') as f:
      f.write ('\n'+'muv,train')
      for i in muv_train:
        f.write(','+i+','+str(muv_train[i]))
      f.write('\n'+'muv,valid')
      for i in muv_valid:
        f.write(','+i+','+str(muv_valid[i]))
      f.write('\n'+'muv,time_for_loading,'+
              str(time_finish_loading-time_start)+'seconds')
      f.write('\n'+'muv,time_for_running,'+
              str(time_finish_running-time_finish_loading)+'seconds')
      
    del tasks_muv, datasets_muv, transformers_muv
    del train_dataset,valid_dataset
    del time_start,time_finish_loading,time_finish_running

  if 'pcba' in dataset_name:
    print('-------------------------------------')
    print('Benchmark test on dataset: pcba')
    print('-------------------------------------')
    base_dir = os.path.join(base_dir_o, "pcba")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")
    test_dir = os.path.join(base_dir, "test_dataset")

    time_start = time.time()
    #loading datasets for pcba
    tasks_pcba,datasets_pcba,transformers_pcba = load_pcba(base_dir,
                                                           reload=reload)
    time_finish_loading = time.time()
   
    #dataset splitting, RandomSplitter function
    print("About to perform train/valid/test split.")
    splitter = RandomSplitter(verbosity=verbosity)
    print("Performing new split.")
    train_dataset,valid_dataset,test_dataset = splitter.train_valid_test_split(
                                datasets_pcba, train_dir, valid_dir, test_dir)
    #running model
    pcba_train,pcba_valid = benchmark_train_and_valid(base_dir,train_dataset,
                            	                      valid_dataset,
                                                      tasks_pcba,
						      transformers_pcba,
                                                      n_features, model,
						      verbosity)
    time_finish_running = time.time()

    with open(os.path.join(out_path,'results.csv'),'a') as f:
      f.write ('\n'+'pcba,train')
      for i in pcba_train:
        f.write(','+i+','+str(pcba_train[i]))
      f.write('\n'+'pcba,valid')
      for i in pcba_valid:
        f.write(','+i+','+str(pcba_valid[i]))
      f.write('\n'+'pcba,time_for_loading,'+
              str(time_finish_loading-time_start)+'seconds')
      f.write('\n'+'pcba,time_for_running,'+
              str(time_finish_running-time_finish_loading)+'seconds')
     
    del tasks_pcba, datasets_pcba, transformers_pcba
    del train_dataset,valid_dataset
    del time_start,time_finish_loading,time_finish_running

  if 'nci' in dataset_name:
    print('-------------------------------------')
    print('Benchmark test on dataset: nci')
    print('-------------------------------------')
    base_dir = os.path.join(base_dir_o,  "nci")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")
    test_dir = os.path.join(base_dir, "test_dataset")

    time_start = time.time()
    #loading datasets for nci
    tasks_nci,datasets_nci,transformers_nci = load_nci(base_dir, reload=reload)
    time_finish_loading = time.time()
    
    #dataset splitting, RandomSplitter function
    print("About to perform train/valid/test split.")
    splitter = RandomSplitter(verbosity=verbosity)
    print("Performing new split.")
    train_dataset,valid_dataset,test_dataset = splitter.train_valid_test_split(
                                datasets_nci, train_dir, valid_dir, test_dir)
    #running model
    nci_train,nci_valid = benchmark_train_and_valid(base_dir,train_dataset,
                                      	            valid_dataset,
                                                    tasks_nci,transformers_nci,
                                                    n_features,model,verbosity)
    time_finish_running = time.time()
    
    with open(os.path.join(out_path,'results.csv'),'a') as f:
      f.write ('\n'+'nci,train')
      for i in nci_train:
        f.write(','+i+','+str(nci_train[i]))
      f.write('\n'+'nci,valid')
      for i in nci_valid:
        f.write(','+i+','+str(nci_valid[i]))
      f.write('\n'+'nci,time_for_loading,'+
              str(time_finish_loading-time_start)+'seconds')
      f.write('\n'+'nci,time_for_running,'+
              str(time_finish_running-time_finish_loading)+'seconds')

    del tasks_nci, datasets_nci, transformers_nci
    del train_dataset,valid_dataset
    del tasks,datasets,transformers
    del train_dataset,valid_dataset, test_dataset
    del time_start,time_finish_loading,time_finish_running

  return None

def benchmark_train_and_valid(base_dir,train_dataset,valid_dataset,tasks,
                              transformers,n_features = 1024,model = 'all',
                              transformers, hyper_parameters,
                              n_features = 1024,model = 'all',
                              verbosity = 'high'):
  """
  Calculate performance of different models on the specific dataset & tasks
  
  Parameters
  ----------
  base_dir, string
  base_dir : string
      path of working folder
      
  train_dataset, dataset struct
  train_dataset : dataset struct
      loaded dataset using load_* or splitter function
      
  valid_dataset, dataset struct
  valid_dataset : dataset struct
      loaded dataset using load_* or splitter function
  
  tasks, list of string
  tasks : list of string
      list of targets(tasks, datasets)
  
  transformers, BalancingTransformer struct
  transformers : BalancingTransformer struct
      loaded properties of dataset from load_* function
  
  n_features, integer, optional (default=1024)
  hyper_parameters : dict of list
      hyper parameters including dropout rate, learning rate, etc.
 
  n_features : integer, optional (default=1024)
      number of features, or length of binary fingerprints
  
  model, string, optional (default='all')
  model : string, optional (default='all')
      choice of which model to use, 'all' = running all models on the dataset
  

  Returns
  -------
  train_scores : dict
      prediction results (AUC, R2) on the training set
  valid_scores : dict
      prediction results (AUC, R2) on the validation set

  """
  train_scores = {}
  valid_scores = {}
  
  # Initialize metrics
  classification_metric = Metric(metrics.roc_auc_score, np.mean,
  classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean,
                                            verbosity=verbosity,
                                            mode="classification")
  
  assert model in ['all', 'tf', 'rf', 'keras']
  assert model in ['all', 'tf', 'rf']

  if model == 'all' or model == 'tf':
    # Initialize model folder
    model_dir_tf = os.path.join(base_dir, "model_tf")
    
    dropouts = hyper_parameters['tf'][0]
    learning_rate = hyper_parameters['tf'][1]
    weight_init_stddevs = hyper_parameters['tf'][2]
    batch_size = hyper_parameters['tf'][3]
    # Building tensorflow MultiTaskDNN model
    tensorflow_model = TensorflowMultiTaskClassifier(
        len(tasks), n_features, model_dir_tf, dropouts=[.25],
        learning_rate=0.001, weight_init_stddevs=[.1],
        batch_size=64, verbosity=verbosity)
    model_tf = TensorflowModel(tensorflow_model, model_dir_tf)
    tensorflow_model = dc.models.TensorflowMultiTaskClassifier(
        len(tasks), n_features, dropouts=[dropouts],
        learning_rate=learning_rate, weight_init_stddevs=[weight_init_stddevs],
        batch_size=batch_size, verbosity=verbosity)
    model_tf = dc.models.TensorflowModel(tensorflow_model)
 
    print('-------------------------------------')
    print('Start fitting by tensorflow')
    model_tf.fit(train_dataset)
    train_evaluator = Evaluator(model_tf, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores['tensorflow'] = train_evaluator.compute_model_performance(
                                [classification_metric])['mean-roc_auc_score']
    valid_evaluator = Evaluator(model_tf, valid_dataset, transformers,
                                verbosity=verbosity)
    valid_scores['tensorflow'] = valid_evaluator.compute_model_performance(
                                [classification_metric])['mean-roc_auc_score']

    train_scores['tensorflow'] = model_tf.evaluate(train_dataset,
                                    [classification_metric],transformers)

    valid_scores['tensorflow'] = model_tf.evaluate(valid_dataset,
                                    [classification_metric],transformers)

  
  if model == 'all' or model == 'rf':
@@ -320,39 +216,40 @@ def benchmark_train_and_valid(base_dir,train_dataset,valid_dataset,tasks,
    def model_builder(model_dir_rf):
      sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=500,n_jobs=-1)
      return SklearnModel(sklearn_model, model_dir_rf)
    model_rf = SingletaskToMultitask(tasks, model_builder, model_dir_rf)
      return dc.models.sklearn_models.SklearnModel(sklearn_model, model_dir_rf)
    model_rf = dc.models.multitask.SingletaskToMultitask(
		tasks, model_builder, model_dir_rf)
    
    print('-------------------------------------')
    print('Start fitting by random forest')
    model_rf.fit(train_dataset)
    train_evaluator = Evaluator(model_rf, train_dataset, transformers, 
                                verbosity=verbosity)
    train_scores['random_forest'] = train_evaluator.compute_model_performance(
                                [classification_metric])['mean-roc_auc_score']
    valid_evaluator = Evaluator(model_rf, valid_dataset, transformers, 
                                verbosity=verbosity)
    valid_scores['random_forest'] = valid_evaluator.compute_model_performance(
                                [classification_metric])['mean-roc_auc_score']
    train_scores['random_forest'] = model_rf.evaluate(train_dataset,
                                    [classification_metric],transformers)

    valid_scores['random_forest'] = model_rf.evaluate(valid_dataset,
                                    [classification_metric],transformers)

  return train_scores, valid_scores

if __name__ == '__main__':
  # Global variables
  np.random.seed(123)
  reload = True
  verbosity = 'high'
  
  #Working folder initialization
  base_dir = "/tmp/benchmark_test_"+time.strftime("%Y_%m_%d", time.localtime())
  if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
  os.makedirs(base_dir)
  base_dir_o="/tmp/benchmark_test_"+time.strftime("%Y_%m_%d", time.localtime())
  if os.path.exists(base_dir_o):
    shutil.rmtree(base_dir_o)
  os.makedirs(base_dir_o)
  
  #Datasets and models used in the benchmark test, all=all the datasets(models)
  dataset_name = sys.argv[1]
  model = sys.argv[2]

  benchmark_loading_datasets(base_dir, n_features = 1024,
  #input hyperparameters
  #tf: dropouts, learning rate, weight initial stddev, batch_size
  hyper_parameters = {'tf':[0.25, 0.0003, 0.1, 50]}

  benchmark_loading_datasets(base_dir_o,hyper_parameters,n_features = 1024,
                             dataset_name = dataset_name, model = model,
                             reload = reload, verbosity = verbosity)
+66 −89
Original line number Diff line number Diff line
@@ -11,32 +11,11 @@ import os
import sys
import numpy as np
import shutil
from deepchem.utils.save import load_sharded_csv
from deepchem.data import Dataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.trans import NormalizationTransformer
import deepchem as dc

def load_nci(base_dir, reload=True, force_transform=False,
             shard_size=1000, num_shards_per_batch=4):
  """Load NCI datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  model = "logistic"
  regen = False
def load_nci(shard_size=1000, num_shards_per_batch=4):

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      print("Deleting dir in nci_datasets.py")
      print(base_dir)
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load nci dataset
  print("About to load NCI dataset.")
@@ -44,15 +23,14 @@ def load_nci(base_dir, reload=True, force_transform=False,
      current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(
      current_dir, "../../datasets/nci_2.csv.gz")

  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset = load_sharded_csv(dataset_paths)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))


  # Featurize nci dataset
  print("About to featurize nci dataset.")
  featurizer = CircularFingerprint(size=1024)
  #was sorted list originally in muv_datasets.py, but csv is ordered so removed
  featurizer = dc.feat.CircularFingerprint(size=1024)

  all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                    'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
                    'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',
@@ -66,24 +44,23 @@ def load_nci(base_dir, reload=True, force_transform=False,
                    'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
                    'T-47D'])

  loader = DataLoader(tasks=all_nci_tasks,
  loader = dc.load.DataLoader(tasks=all_nci_tasks,
                     	      smiles_field="smiles",
	                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_paths, data_dir, shard_size=shard_size,
        	              verbosity='high')

  dataset = loader.featurize(dataset_paths, shard_size=shard_size,
                             num_shards_per_batch=num_shards_per_batch)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = []
  if regen or force_transform:
  print("About to transform data")
  transformers = [
        NormalizationTransformer(transform_y=True, dataset=dataset)]
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  
  return all_nci_tasks, dataset, transformers
  splitter = dc.splits.RandomSplitter()
  print("Performing new split.")
  train, valid, test = splitter.train_valid_test_split(dataset)

  return all_nci_tasks, (train, valid, test), transformers
+67 −94

File changed.

Preview size limit exceeded, changes collapsed.

+0 −0

Empty file added.

Loading