Commit 428c18d7 authored by ZHENQIN WU's avatar ZHENQIN WU
Browse files

benchmark style change

parent d9e5318d
Loading
Loading
Loading
Loading
+186 −45
Original line number Diff line number Diff line
@@ -10,14 +10,15 @@ Benchmark test:
Giving classification performances of: 
    Random forest(rf), MultitaskDNN(tf), 
    RobustMultitaskDNN(tf_robust),
    Logistic regression(logreg),
    Logistic regression(logreg), IRV(irv)
    Graph convolution(graphconv)                 
on datasets: muv, pcba, tox21, sider, toxcast
on datasets: muv, pcba, tox21, sider, toxcast, clintox

Giving regression performances of:
    MultitaskDNN(tf_regression),
    Graph convolution regression(graphconvreg)
on datasets: delaney, nci, kaggle, pdbbind, chembl
on datasets: delaney(ESOL), nci, kaggle, pdbbind, gdb7, 
             chembl, sampl(FreeSolv)

time estimation listed in README file

@@ -37,7 +38,6 @@ import tensorflow as tf
import argparse
from keras import backend as K
import csv

from sklearn.ensemble import RandomForestClassifier

from muv.muv_datasets import load_muv
@@ -48,10 +48,9 @@ from toxcast.toxcast_datasets import load_toxcast
from sider.sider_datasets import load_sider
from kaggle.kaggle_datasets import load_kaggle
from delaney.delaney_datasets import load_delaney
from nci.nci_datasets import load_nci
from pdbbind.pdbbind_datasets import load_pdbbind_grid
from chembl.chembl_datasets import load_chembl
from qm7.qm7_datasets import load_qm7
from gdb7.gdb7_datasets import load_gdb7
from sampl.sampl_datasets import load_sampl
from clintox.clintox_datasets import load_clintox

@@ -61,20 +60,22 @@ def benchmark_loading_datasets(hyper_parameters,
                               model='tf',
                               split=None,
                               reload=True,
                               out_path='.'):
                               out_path='.',
                               test=False):
  """
  Loading dataset for benchmark test
  
  Parameters
  ----------
  hyper_parameters: dict of list
      hyper parameters including dropout rate, learning rate, etc.
      hyper parameters including layer size, dropout, learning rate, etc.
  dataset: string, optional (default='tox21')
      choice of which dataset to use, should be: tox21, muv, sider, 
      toxcast, pcba, delaney, kaggle, nci, clintox
      toxcast, pcba, delaney, kaggle, nci, clintox, pdbbind, chembl,
      gdb7, sampl
  model: string,  optional (default='tf')
      choice of which model to use, should be: rf, tf, tf_robust, logreg,
      graphconv, tf_regression, graphconvreg
      irv, graphconv, tf_regression, graphconvreg
  split: string,  optional (default=None)
      choice of splitter function, None = using the default splitter
  out_path: string, optional(default='.')
@@ -83,7 +84,9 @@ def benchmark_loading_datasets(hyper_parameters,

  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox']:
    mode = 'classification'
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'sampl']:
  elif dataset in [
      'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'gdb7', 'sampl'
  ]:
    mode = 'regression'
  else:
    raise ValueError('Dataset not supported')
@@ -92,7 +95,7 @@ def benchmark_loading_datasets(hyper_parameters,
  if model in ['graphconv', 'graphconvreg']:
    featurizer = 'GraphConv'
    n_features = 75
  elif model in ['tf', 'tf_robust', 'logreg', 'rf', 'tf_regression']:
  elif model in ['tf', 'tf_robust', 'logreg', 'rf', 'irv', 'tf_regression']:
    featurizer = 'ECFP'
    n_features = 1024
  else:
@@ -115,12 +118,21 @@ def benchmark_loading_datasets(hyper_parameters,
    if not model in ['tf_regression']:
      return

  if dataset in ['gdb7']:
    featurizer = None
    if split in ['scaffold'
                ]:  # gdb7 supports index, random and indice splitting
      return
    if not model in ['tf_regression']:
      return

  if split in ['year']:
    if not dataset in ['chembl']:
      return
  elif split in ['indice']:
    if not dataset in ['gdb7']:
      return
  elif not split in [None, 'index', 'random', 'scaffold', 'butina']:
  elif not split in [None, 'index', 'random', 'scaffold']:
    raise ValueError('Splitter function not supported')

  loading_functions = {
@@ -134,6 +146,7 @@ def benchmark_loading_datasets(hyper_parameters,
      'delaney': load_delaney,
      'pdbbind': load_pdbbind_grid,
      'chembl': load_chembl,
      'gdb7': load_gdb7,
      'sampl': load_sampl,
      'clintox': load_clintox
  }
@@ -154,7 +167,7 @@ def benchmark_loading_datasets(hyper_parameters,
  train_dataset, valid_dataset, test_dataset = all_dataset
  time_finish_loading = time.time()
  # time_finish_loading-time_start is the time(s) used for dataset loading
  if dataset in ['kaggle', 'pdbbind']:
  if dataset in ['kaggle', 'pdbbind', 'gdb7']:
    n_features = train_dataset.get_data_shape()[0]
    # dataset has customized features

@@ -163,26 +176,30 @@ def benchmark_loading_datasets(hyper_parameters,
    time_start_fitting = time.time()
    if mode == 'classification':
      metric = 'auc'
      train_score, valid_score = benchmark_classification(
      train_score, valid_score, test_score = benchmark_classification(
          train_dataset,
          valid_dataset,
          test_dataset,
          tasks,
          transformers,
          hp,
          n_features,
          metric=metric,
          model=model)
          model=model,
          test=test)
    elif mode == 'regression':
      metric = 'r2'
      train_score, valid_score = benchmark_regression(
      train_score, valid_score, test_score = benchmark_regression(
          train_dataset,
          valid_dataset,
          test_dataset,
          tasks,
          transformers,
          hp,
          n_features,
          metric=metric,
          model=model)
          model=model,
          test=test)
    time_finish_fitting = time.time()

    with open(os.path.join(out_path, 'results.csv'), 'a') as f:
@@ -192,9 +209,12 @@ def benchmark_loading_datasets(hyper_parameters,
          output_line = [
              count, dataset, str(split), mode, 'train', i,
              train_score[i]['mean-roc_auc_score'], 'valid', i,
              valid_score[i]['mean-roc_auc_score'], 'time_for_running',
              time_finish_fitting - time_start_fitting
              valid_score[i]['mean-roc_auc_score']
          ]
          if test:
            output_line.extend(['test', i, test_score[i]['mean-roc_auc_score']])
          output_line.extend(
              ['time_for_running', time_finish_fitting - time_start_fitting])
          writer.writerow(output_line)
      else:
        for i in train_score:
@@ -202,29 +222,39 @@ def benchmark_loading_datasets(hyper_parameters,
            output_line = [
                count, dataset, str(split), mode, 'train', i,
                train_score[i]['mean-pearson_r2_score'], 'valid', i,
                valid_score[i]['mean-pearson_r2_score'], 'time_for_running',
                time_finish_fitting - time_start_fitting
                valid_score[i]['mean-pearson_r2_score']
            ]
            if test:
              output_line.extend(
                  ['test', i, test_score[i]['mean-pearson_r2_score']])
            output_line.extend(
                ['time_for_running', time_finish_fitting - time_start_fitting])
          elif metric == 'mae':
            output_line = [
                count, dataset, str(split), mode, 'train', i,
                train_score[i]['mean-mean_absolute_error'], 'valid', i,
                valid_score[i]['mean-mean_absolute_error'], 'time_for_running',
                time_finish_fitting - time_start_fitting
                valid_score[i]['mean-mean_absolute_error']
            ]
            if test:
              output_line.extend(
                  ['test', i, test_score[i]['mean-mean_absolute_error']])
            output_line.extend(
                ['time_for_running', time_finish_fitting - time_start_fitting])

          writer.writerow(output_line)


def benchmark_classification(train_dataset,
                             valid_dataset,
                             test_dataset,
                             tasks,
                             transformers,
                             hyper_parameters,
                             n_features,
                             metric='auc',
                             model='tf',
                             seed=123):
                             seed=123,
                             test=False):
  """
  Calculate performance of different models on the specific dataset & tasks
  
@@ -238,13 +268,15 @@ def benchmark_classification(train_dataset,
      list of targets(tasks, datasets)
  transformers: BalancingTransformer struct
      loaded properties of dataset from load_* function
  hyper_parameters: dict
      hyper parameters including dropout rate, learning rate, etc.
  hyper_parameters: dict of list
      hyper parameters including layer size, dropout, learning rate, etc.
  n_features: integer
      number of features, or length of binary fingerprints
  model: string,  optional (default='tf')
      choice of which model to use, should be: rf, tf, tf_robust, logreg,
      graphconv
      irv, graphconv
  test: boolean
      whether to calculate test_set performance
  

  Returns
@@ -253,16 +285,20 @@ def benchmark_classification(train_dataset,
	predicting results(AUC) on training set
  valid_scores : dict
	predicting results(AUC) on valid set
  test_scores : dict
	predicting results(AUC) on test set
 

  """
  train_scores = {}
  valid_scores = {}
  test_scores = {}

  # Initialize metrics
  if metric == 'auc':
    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

  assert model in ['rf', 'tf', 'tf_robust', 'logreg', 'graphconv']
  assert model in ['rf', 'tf', 'tf_robust', 'logreg', 'irv', 'graphconv']

  if model == 'tf':
    # Loading hyper parameters
@@ -301,6 +337,10 @@ def benchmark_classification(train_dataset,
    valid_scores['tf'] = model_tf.evaluate(
        valid_dataset, [classification_metric], transformers)

    if test:
      test_scores['tf'] = model_tf.evaluate(
          test_dataset, [classification_metric], transformers)

  if model == 'tf_robust':
    # Loading hyper parameters
    layer_sizes = hyper_parameters['layer_sizes']
@@ -348,6 +388,10 @@ def benchmark_classification(train_dataset,
    valid_scores['tf_robust'] = model_tf_robust.evaluate(
        valid_dataset, [classification_metric], transformers)

    if test:
      test_scores['tf_robust'] = model_tf_robust.evaluate(
          test_dataset, [classification_metric], transformers)

  if model == 'logreg':
    # Loading hyper parameters
    penalty = hyper_parameters['penalty']
@@ -377,6 +421,51 @@ def benchmark_classification(train_dataset,
    valid_scores['logreg'] = model_logreg.evaluate(
        valid_dataset, [classification_metric], transformers)

    if test:
      test_scores['logreg'] = model_logreg.evaluate(
          test_dataset, [classification_metric], transformers)

  if model == 'irv':
    # Loading hyper parameters
    penalty = hyper_parameters['penalty']
    penalty_type = hyper_parameters['penalty_type']
    batch_size = hyper_parameters['batch_size']
    nb_epoch = hyper_parameters['nb_epoch']
    learning_rate = hyper_parameters['learning_rate']
    n_K = hyper_parameters['n_K']

    # Transform fingerprints to IRV features
    transformer = dc.trans.IRVTransformer(n_K, len(tasks), train_dataset)
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    if test:
      test_dataset = transformer.transform(test_dataset)

    # Building tensorflow IRV model
    model_irv = dc.models.TensorflowMultiTaskIRVClassifier(
        len(tasks),
        K=n_K,
        penalty=penalty,
        penalty_type=penalty_type,
        batch_size=batch_size,
        learning_rate=learning_rate,
        seed=seed)

    print('-------------------------------------')
    print('Start fitting by IRV classifier')
    model_irv.fit(train_dataset, nb_epoch=nb_epoch)

    # Evaluating tensorflow IRV model
    train_scores['irv'] = model_irv.evaluate(
        train_dataset, [classification_metric], transformers)

    valid_scores['irv'] = model_irv.evaluate(
        valid_dataset, [classification_metric], transformers)

    if test:
      test_scores['irv'] = model_irv.evaluate(
          test_dataset, [classification_metric], transformers)

  if model == 'graphconv':
    # Initialize model folder

@@ -427,6 +516,10 @@ def benchmark_classification(train_dataset,
        valid_scores['graphconv'] = model_graphconv.evaluate(
            valid_dataset, [classification_metric], transformers)

        if test:
          test_scores['graphconv'] = model_graphconv.evaluate(
              test_dataset, [classification_metric], transformers)

  if model == 'rf':
    # Initialize model folder

@@ -452,18 +545,24 @@ def benchmark_classification(train_dataset,
    valid_scores['rf'] = model_rf.evaluate(
        valid_dataset, [classification_metric], transformers)

  return train_scores, valid_scores
    if test:
      test_scores['rf'] = model_rf.evaluate(
          test_dataset, [classification_metric], transformers)

  return train_scores, valid_scores, test_scores


def benchmark_regression(train_dataset,
                         valid_dataset,
                         test_dataset,
                         tasks,
                         transformers,
                         hyper_parameters,
                         n_features,
                         metric='r2',
                         model='tf_regression',
                         seed=123):
                         seed=123,
                         test=False):
  """
  Calculate performance of different models on the specific dataset & tasks
  
@@ -477,23 +576,28 @@ def benchmark_regression(train_dataset,
      list of targets(tasks, datasets)
  transformers: BalancingTransformer struct
      loaded properties of dataset from load_* function
  hyper_parameters: dict
      hyper parameters including dropout rate, learning rate, etc.
  hyper_parameters: dict of list
      hyper parameters including layer size, dropout, learning rate, etc.
  n_features: integer
      number of features, or length of binary fingerprints
  model: string,  optional (default='tf_regression')
      choice of which model to use, should be: tf_regression, graphconvreg
  
  test: boolean
      whether to calculate test_set performance  

  Returns
  -------
  train_scores: dict
      predicting results(R2) on training set
      predicting results(R2 or mae) on training set
  valid_scores: dict
      predicting results(R2) on valid set
      predicting results(R2 or mae) on valid set  
  test_scores : dict
	predicting results(R2 or mae) on test set
 
  """
  train_scores = {}
  valid_scores = {}
  test_scores = {}

  # Initialize metrics
  if metric == 'r2':
@@ -540,6 +644,11 @@ def benchmark_regression(train_dataset,

    valid_scores['tf_regression'] = model_tf_regression.evaluate(
        valid_dataset, [regression_metric], transformers)

    if test:
      test_scores['tf_regression'] = model_tf_regression.evaluate(
          test_dataset, [regression_metric], transformers)

  if model == 'graphconvreg':
    # Initialize model folder

@@ -590,7 +699,11 @@ def benchmark_regression(train_dataset,
        valid_scores['graphconvreg'] = model_graphconvreg.evaluate(
            valid_dataset, [regression_metric], transformers)

  return train_scores, valid_scores
        if test:
          test_scores['graphconvreg'] = model_graphconvreg.evaluate(
              test_dataset, [regression_metric], transformers)

  return train_scores, valid_scores, test_scores


if __name__ == '__main__':
@@ -611,7 +724,7 @@ if __name__ == '__main__':
      action='append',
      dest='model_args',
      default=[],
      help='Choice of model: tf, tf_robust, logreg, graphconv, ' +
      help='Choice of model: tf, tf_robust, logreg, rf, irv, graphconv, ' +
      'tf_regression, graphconvreg')
  parser.add_argument(
      '-d',
@@ -619,12 +732,20 @@ if __name__ == '__main__':
      dest='dataset_args',
      default=[],
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' +
      'kaggle, delaney, nci, pdbbindi, chembl, clintox')
      'kaggle, delaney, nci, pdbbind, chembl, sampl, gdb7, clintox')
  parser.add_argument(
      '-t',
      action='store_true',
      dest='test',
      default=False,
      help='Evalute performance on test set')

  args = parser.parse_args()
  #Datasets and models used in the benchmark test
  splitters = args.splitter_args
  models = args.model_args
  datasets = args.dataset_args
  test = args.test

  if len(splitters) == 0:
    splitters = ['index', 'random', 'scaffold']
@@ -633,10 +754,11 @@ if __name__ == '__main__':
        'tf', 'tf_robust', 'logreg', 'graphconv', 'tf_regression',
        'graphconvreg'
    ]
    #irv and rf should be assigned manually
  if len(datasets) == 0:
    datasets = [
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'delaney', 'nci',
        'kaggle', 'pdbbind', 'chembl'
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'sampl',
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'gdb7'
    ]

  #input hyperparameters
@@ -680,6 +802,15 @@ if __name__ == '__main__':
      'learning_rate': 0.005
  }]

  hps['irv'] = [{
      'penalty': 0.,
      'penalty_type': 'l2',
      'batch_size': 50,
      'nb_epoch': 10,
      'learning_rate': 0.001,
      'n_K': 10
  }]

  hps['graphconv'] = [{
      'batch_size': 50,
      'nb_epoch': 15,
@@ -716,11 +847,21 @@ if __name__ == '__main__':
    for dataset in datasets:
      if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox']:
        for model in models:
          if model in ['tf', 'tf_robust', 'logreg', 'graphconv', 'rf']:
          if model in ['tf', 'tf_robust', 'logreg', 'graphconv', 'rf', 'irv']:
            benchmark_loading_datasets(
                hps, dataset=dataset, model=model, split=split, out_path='.')
                hps,
                dataset=dataset,
                model=model,
                split=split,
                out_path='.',
                test=test)
      else:
        for model in models:
          if model in ['tf_regression', 'graphconvreg']:
            benchmark_loading_datasets(
                hps, dataset=dataset, model=model, split=split, out_path='.')
                hps,
                dataset=dataset,
                model=model,
                split=split,
                out_path='.',
                test=test)