Commit ad6eb4b5 authored by miaecle
Browse files

merge in gdb7 to benchmark

parent 7d5d914b
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -15,8 +15,8 @@ from deepchem.feat.base_classes import UserDefinedFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.fingerprints import CircularFingerprint
from deepchem.feat.basic import RDKitDescriptors
from deepchem.feat.coulomb_matrices import CoulombMatrixEig
from deepchem.feat.coulomb_matrices import CoulombMatrix
from deepchem.feat.coulomb_matrices import CoulombMatrixEig
from deepchem.feat.grid_featurizer import GridFeaturizer
from deepchem.feat.nnscore_utils import hydrogenate_and_compute_partial_charges
from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
+61 −30
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ on datasets: muv, pcba, tox21, sider, toxcast
Giving regression performances of:
    MultitaskDNN(tf_regression),
    Graph convolution regression(graphconvreg)
on datasets: delaney, nci, kaggle, pdbbind
on datasets: delaney, nci, kaggle, pdbbind, gdb7, chembl

time estimation listed in README file

@@ -51,6 +51,7 @@ from delaney.delaney_datasets import load_delaney
from nci.nci_datasets import load_nci
from pdbbind.pdbbind_datasets import load_pdbbind_grid
from chembl.chembl_datasets import load_chembl
from gdb7.gdb7_datasets import load_gdb7

def benchmark_loading_datasets(hyper_parameters, 
                               dataset='tox21', model='tf', split=None,
@@ -76,12 +77,12 @@ def benchmark_loading_datasets(hyper_parameters,
  
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
    mode = 'classification'
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl']:
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'gdb7']:
    mode = 'regression'
  else:
    raise ValueError('Dataset not supported')
  
  #assigning featurizer
  # Assigning featurizer
  if model in ['graphconv', 'graphconvreg']:
    featurizer = 'GraphConv'
    n_features = 75
@@ -91,6 +92,7 @@ def benchmark_loading_datasets(hyper_parameters,
  else:
    raise ValueError('Model not supported')
  
  # Some exceptions in datasets
  if dataset in ['kaggle']:
    featurizer = None # kaggle dataset use its own features
    if split in ['random', 'scaffold']:
@@ -107,7 +109,20 @@ def benchmark_loading_datasets(hyper_parameters,
    if not model in ['tf_regression']:
      return

  if not split in [None, 'index','random','scaffold']:
  if dataset in ['gdb7']:
    featurizer = None
    if split in ['scaffold']: # gdb7 supports index, random and indice splitting
      return
    if not model in ['tf_regression']:
      return

  if split in ['year']:
    if not dataset in ['chembl']:
      return
  elif split in ['indice']:
    if not dataset in ['gdb7']:
      return
  elif not split in [None, 'index','random','scaffold']:
    raise ValueError('Splitter function not supported')
  
  loading_functions = {'tox21': load_tox21, 'muv': load_muv,
@@ -115,7 +130,7 @@ def benchmark_loading_datasets(hyper_parameters,
                       'sider': load_sider, 'toxcast': load_toxcast,
                       'kaggle': load_kaggle, 'delaney': load_delaney,
                       'pdbbind': load_pdbbind_grid,
                       'chembl': load_chembl}
                       'chembl': load_chembl, 'gdb7': load_gdb7}
  
  print('-------------------------------------')
  print('Benchmark %s on dataset: %s' % (model, dataset))
@@ -133,22 +148,26 @@ def benchmark_loading_datasets(hyper_parameters,
  train_dataset, valid_dataset, test_dataset = all_dataset
  time_finish_loading = time.time()
  # time_finish_loading-time_start is the time(s) used for dataset loading
  if dataset in ['kaggle','pdbbind']:
  if dataset in ['kaggle','pdbbind', 'gdb7']:
    n_features = train_dataset.get_data_shape()[0]
    #kaggle dataset has customized features
    # dataset has customized features
    
  # running model
  for count, hp in enumerate(hyper_parameters[model]):
    time_start_fitting = time.time()
    if mode == 'classification':
      metric = 'auc'
      train_score, valid_score = benchmark_classification(
          train_dataset, valid_dataset, tasks, 
          transformers, hp, n_features,
          transformers, hp, n_features, metric=metric,
          model=model)      
    elif mode == 'regression':
      metric = 'r2'
      if dataset in ['gdb7']:
        metric = 'mae'
      train_score, valid_score = benchmark_regression(
          train_dataset, valid_dataset, tasks, 
          transformers, hp, n_features,
          transformers, hp, n_features, metric=metric,
          model=model)  
    time_finish_fitting = time.time()
    
@@ -165,16 +184,24 @@ def benchmark_loading_datasets(hyper_parameters,
          writer.writerow(output_line)
      else:
        for i in train_score:
          if metric == 'r2':
            output_line = [count, dataset, str(split), mode, 'train', i, 
                           train_score[i]['mean-pearson_r2_score'], 'valid', i, 
                           valid_score[i]['mean-pearson_r2_score'], 
                           'time_for_running',
                           time_finish_fitting-time_start_fitting]
          elif metric == 'mae':
            output_line = [count, dataset, str(split), mode, 'train', i, 
                           train_score[i]['mean-mean_absolute_error'], 'valid', i, 
                           valid_score[i]['mean-mean_absolute_error'], 
                           'time_for_running',
                           time_finish_fitting-time_start_fitting]
 
          writer.writerow(output_line)

def benchmark_classification(train_dataset, valid_dataset, tasks,
                             transformers, hyper_parameters, 
                             n_features, model='tf', seed=123):
                             n_features, metric='auc', model='tf', seed=123):
  """
  Calculate performance of different models on the specific dataset & tasks
  
@@ -209,6 +236,7 @@ def benchmark_classification(train_dataset, valid_dataset, tasks,
  valid_scores = {}
  
  # Initialize metrics
  if metric == 'auc':
    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
  
  assert model in ['rf', 'tf', 'tf_robust', 'logreg', 'graphconv']
@@ -384,8 +412,8 @@ def benchmark_classification(train_dataset, valid_dataset, tasks,

  
def benchmark_regression(train_dataset, valid_dataset, tasks,
                         transformers, hyper_parameters, 
                         n_features, model='tf_regression', seed=123):
                         transformers, hyper_parameters, n_features, 
                         metric='r2', model='tf_regression', seed=123):
  """
  Calculate performance of different models on the specific dataset & tasks
  
@@ -418,7 +446,10 @@ def benchmark_regression(train_dataset, valid_dataset, tasks,
  valid_scores = {}
  
  # Initialize metrics
  if metric == 'r2':
    regression_metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
  elif metric == 'mae':
    regression_metric = dc.metrics.Metric(dc.metrics.mean_absolute_error, np.mean)

  assert model in ['tf_regression', 'graphconvreg']

@@ -513,7 +544,7 @@ if __name__ == '__main__':
           'tf_regression, graphconvreg')
  parser.add_argument('-d', action='append', dest='dataset_args', default=[], 
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' + 
           'kaggle, delaney, nci, pdbbind')
           'kaggle, delaney, nci, pdbbindi, chembl, gdb7')
  args = parser.parse_args()
  #Datasets and models used in the benchmark test
  splitters = args.splitter_args
@@ -527,7 +558,7 @@ if __name__ == '__main__':
              'tf_regression', 'graphconvreg']
  if len(datasets) == 0:
    datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 
                'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl']
                'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'gdb7']

  #input hyperparameters
  #tf: dropouts, learning rate, layer_sizes, weight initial stddev,penalty,
+0 −0

Empty file added.

+1 −3
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ import shutil
import deepchem as dc
import csv

def load_gdb7(featurizer=None, split='indice'):
def load_gdb7(featurizer=None, split='random'):
  """Load gdb7 datasets."""
  # Featurize gdb7 dataset
  print("About to featurize gdb7 dataset.")
@@ -51,6 +51,4 @@ def load_gdb7(featurizer=None, split='indice'):
               'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1])}
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  print(valid.X.shape)
  print(train.X.shape)
  return gdb7_tasks, (train, valid, test), transformers