Commit 5244ca0b authored by ZHENQIN WU
Browse files

SAMPL performance

parent 40e80761
Loading
Loading
Loading
Loading
+15 −6
Original line number Diff line number Diff line
@@ -245,12 +245,18 @@ Scaffold splitting

|Dataset         |Model               |Splitting   |Train score/R2|Valid score/R2|
|----------------|--------------------|------------|--------------|--------------|
|delaney         |MT-NN regression    |Index       |0.773         |0.574         |
|                |graphconv regression|Index       |0.991         |0.825         |
|                |MT-NN regression    |Random      |0.769         |0.591         |
|                |graphconv regression|Random      |0.996         |0.873         |
|                |MT-NN regression    |Scaffold    |0.782         |0.426         |
|                |graphconv regression|Scaffold    |0.994         |0.606         |
|delaney         |MT-NN regression    |Index       |0.868         |0.578         |
|                |graphconv regression|Index       |0.967         |0.790         |
|                |MT-NN regression    |Random      |0.865         |0.574         |
|                |graphconv regression|Random      |0.964         |0.782         |
|                |MT-NN regression    |Scaffold    |0.866         |0.342         |
|                |graphconv regression|Scaffold    |0.967         |0.606         |
|sampl           |MT-NN regression    |Index       |0.917         |0.764         |
|                |graphconv regression|Index       |0.982         |0.864         |
|                |MT-NN regression    |Random      |0.908         |0.830         |
|                |graphconv regression|Random      |0.987         |0.868         |
|                |MT-NN regression    |Scaffold    |0.891         |0.217         |
|                |graphconv regression|Scaffold    |0.985         |0.666         |
|nci             |MT-NN regression    |Index       |0.171         |0.062         |
|                |graphconv regression|Index       |0.123         |0.048         |
|                |MT-NN regression    |Random      |0.168         |0.085         |
@@ -286,6 +292,7 @@ Number of tasks and examples in the datasets
|sider           |27         |1427       |
|toxcast         |617        |8615       |
|delaney         |1          |1128       |
|sampl           |1          |643        |
|kaggle          |15         |173065     |
|nci             |60         |19127      |
|pdbbind(core)   |1          |195        |
@@ -322,6 +329,8 @@ Time needed for benchmark test(~20h in total)
|                |graph convolution   |80              |900            |
|delaney         |MT-NN regression    |10              |40             |
|                |graphconv regression|10              |40             |
|sampl           |MT-NN regression    |10              |30             |
|                |graphconv regression|10              |40             |
|nci             |MT-NN regression    |400             |1200           |
|                |graphconv regression|400             |2500           |
|pdbbind(core)   |MT-NN regression    |0(featurized)   |30             |
+6 −5
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ from nci.nci_datasets import load_nci
from pdbbind.pdbbind_datasets import load_pdbbind_grid
from chembl.chembl_datasets import load_chembl
from gdb7.gdb7_datasets import load_gdb7
from sampl.sampl_datasets import load_sampl

def benchmark_loading_datasets(hyper_parameters, 
                               dataset='tox21', model='tf', split=None,
@@ -77,7 +78,8 @@ def benchmark_loading_datasets(hyper_parameters,
  
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
    mode = 'classification'
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'gdb7']:
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 
                   'gdb7', 'sampl']:
    mode = 'regression'
  else:
    raise ValueError('Dataset not supported')
@@ -130,7 +132,8 @@ def benchmark_loading_datasets(hyper_parameters,
                       'sider': load_sider, 'toxcast': load_toxcast,
                       'kaggle': load_kaggle, 'delaney': load_delaney,
                       'pdbbind': load_pdbbind_grid,
                       'chembl': load_chembl, 'gdb7': load_gdb7}
                       'chembl': load_chembl, 'gdb7': load_gdb7,
                       'sampl': load_sampl}
  
  print('-------------------------------------')
  print('Benchmark %s on dataset: %s' % (model, dataset))
@@ -163,8 +166,6 @@ def benchmark_loading_datasets(hyper_parameters,
          model=model)      
    elif mode == 'regression':
      metric = 'r2'
      if dataset in ['gdb7']:
        metric = 'mae'
      train_score, valid_score = benchmark_regression(
          train_dataset, valid_dataset, tasks, 
          transformers, hp, n_features, metric=metric,
@@ -594,7 +595,7 @@ if __name__ == '__main__':
                           'dropouts': [0.25, 0.25], 
                           'penalty': 0.0005, 'penalty_type': 'l2', 
                           'batch_size': 128, 'nb_epoch': 50, 
                           'learning_rate': 0.00008}]
                           'learning_rate': 0.0008}]
  
  hps['graphconvreg'] = [{'batch_size': 128, 'nb_epoch': 20, 
                          'learning_rate': 0.0005, 'n_filters': 128, 
+0 −0

File moved.

+0 −0

Empty file added.

+2 −2
Original line number Diff line number Diff line
@@ -10,14 +10,14 @@ import numpy as np
import shutil
import deepchem as dc

def load_SAMPL(featurizer='ECFP', split='index'):
def load_sampl(featurizer='ECFP', split='index'):
  """Load SAMPL datasets."""
  # Featurize SAMPL dataset
  print("About to featurize SAMPL dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./SAMPL.csv")
  SAMPL_tasks = ['expt', 'calc']
  SAMPL_tasks = ['expt']
  if featurizer == 'ECFP':
    featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
Loading