Commit 6df39d4e authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #362 from miaecle/chembl

Merge in gdb7 and chembl
parents e9eb598a 1998db52
Loading
Loading
Loading
Loading
+15 −1
Original line number Diff line number Diff line
@@ -260,8 +260,18 @@ Scaffold splitting
|pdbbind(core)   |MT-NN regression    |Random      |0.973         |0.494         |
|pdbbind(refined)|MT-NN regression    |Random      |0.987         |0.503         |
|pdbbind(full)   |MT-NN regression    |Random      |0.983         |0.528         |
|chembl          |MT-NN regression    |Index       |0.443         |0.427         |
|                |MT-NN regression    |Random      |0.464         |0.434         |
|                |MT-NN regression    |Scaffold    |0.484         |0.361         |
|gdb7            |MT-NN regression    |Index       |0.961         |0.011         |
|                |MT-NN regression    |Random      |0.742         |0.732         |
|kaggle          |MT-NN regression    |User-defined|0.748         |0.452         |

|Dataset         |Model               |Splitting   |Train score/MAE(kcal/mol)|Valid score/MAE(kcal/mol)|
|----------------|--------------------|------------|-------------------------|-------------------------|
|gdb7            |MT-NN regression    |Index       |44.5                     |185.6                    |
|                |MT-NN regression    |Random      |86.1                     |92.2                     |

* General features

Number of tasks and examples in the datasets
@@ -279,6 +289,8 @@ Number of tasks and examples in the datasets
|pdbbind(core)   |1          |195        |
|pdbbind(refined)|1          |3706       |
|pdbbind(full)   |1          |11908      |
|chembl(5thresh) |691        |23871      |
|gdb7            |1          |7165       |



@@ -313,6 +325,8 @@ Time needed for benchmark test(~20h in total)
|pdbbind(core)   |MT-NN regression    |0(featurized)   |30             |
|pdbbind(refined)|MT-NN regression    |0(featurized)   |40             |
|pdbbind(full)   |MT-NN regression    |0(featurized)   |60             |
|chembl          |MT-NN regression    |200             |9000           |
|gdb7            |MT-NN regression    |10              |110            |
|kaggle          |MT-NN regression    |2200            |3200           |


+1 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@ from deepchem.feat.base_classes import UserDefinedFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.fingerprints import CircularFingerprint
from deepchem.feat.basic import RDKitDescriptors
from deepchem.feat.coulomb_matrices import CoulombMatrix
from deepchem.feat.coulomb_matrices import CoulombMatrixEig
from deepchem.feat.grid_featurizer import GridFeaturizer
from deepchem.feat.nnscore_utils import hydrogenate_and_compute_partial_charges
+1 −0
Original line number Diff line number Diff line
@@ -10,5 +10,6 @@ from deepchem.splits.splitters import *
from deepchem.splits.splitters import ScaffoldSplitter
from deepchem.splits.splitters import SpecifiedSplitter
from deepchem.splits.splitters import IndexSplitter
from deepchem.splits.splitters import IndiceSplitter
from deepchem.splits.task_splitter import merge_fold_datasets
from deepchem.splits.task_splitter import TaskSplitter
+39 −0
Original line number Diff line number Diff line
@@ -327,6 +327,45 @@ class IndexSplitter(Splitter):
    return (indices[:train_cutoff], indices[train_cutoff:valid_cutoff],
            indices[valid_cutoff:])

class IndiceSplitter(Splitter):
  """
  Class for splits based on user-specified validation and test indices.

  Every sample whose index appears in neither `valid_indices` nor
  `test_indices` is assigned to the training set.
  """
  def __init__(self, verbose=False, valid_indices=None, test_indices=None):
    """
    Parameters
    -----------
    verbose: bool
        stored on the instance as `self.verbose`
    valid_indices: list of int
        indices of samples in the valid set
    test_indices: list of int
        indices of samples in the test set
    """
    self.verbose = verbose
    self.valid_indices = valid_indices
    self.test_indices = test_indices

  def split(self, dataset, seed=None, frac_train=.8, frac_valid=.1,
            frac_test=.1, log_every_n=None):
    """
    Splits internal compounds into train/validation/test by designated indices.

    Parameters
    -----------
    dataset: object supporting `len()`
        only its length is used, to enumerate the sample indices
    seed, frac_train, frac_valid, frac_test, log_every_n:
        accepted but not used by this splitter; the split is fully
        determined by the index lists given to the constructor

    Returns
    -------
    tuple (train_indices, valid_indices, test_indices)
        `train_indices` is an ascending list of every index in
        `range(len(dataset))` that is in neither held-out list.
        `valid_indices` / `test_indices` are the objects passed to the
        constructor, or an empty list where `None` was given.
    """
    # Work on locals so that calling split() does not alter instance state.
    valid_indices = [] if self.valid_indices is None else self.valid_indices
    test_indices = [] if self.test_indices is None else self.test_indices

    # Both held-out sets must be excluded from training. A set gives O(1)
    # membership tests, and building a new list avoids removing elements
    # from a list while iterating over it (which silently skips entries).
    held_out = set(valid_indices)
    held_out.update(test_indices)
    train_indices = [i for i in range(len(dataset)) if i not in held_out]

    return (train_indices, valid_indices, test_indices)


class ScaffoldSplitter(Splitter):
  """
+63 −30
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ on datasets: muv, pcba, tox21, sider, toxcast
Giving regression performances of:
    MultitaskDNN(tf_regression),
    Graph convolution regression(graphconvreg)
on datasets: delaney, nci, kaggle, pdbbind
on datasets: delaney, nci, kaggle, pdbbind, gdb7, chembl

time estimation listed in README file

@@ -50,6 +50,8 @@ from kaggle.kaggle_datasets import load_kaggle
from delaney.delaney_datasets import load_delaney
from nci.nci_datasets import load_nci
from pdbbind.pdbbind_datasets import load_pdbbind_grid
from chembl.chembl_datasets import load_chembl
from gdb7.gdb7_datasets import load_gdb7

def benchmark_loading_datasets(hyper_parameters, 
                               dataset='tox21', model='tf', split=None,
@@ -75,12 +77,12 @@ def benchmark_loading_datasets(hyper_parameters,
  
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
    mode = 'classification'
  elif dataset in ['kaggle', 'delaney', 'nci','pdbbind']:
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'gdb7']:
    mode = 'regression'
  else:
    raise ValueError('Dataset not supported')
  
  #assigning featurizer
  # Assigning featurizer
  if model in ['graphconv', 'graphconvreg']:
    featurizer = 'GraphConv'
    n_features = 75
@@ -90,6 +92,7 @@ def benchmark_loading_datasets(hyper_parameters,
  else:
    raise ValueError('Model not supported')
  
  # Some exceptions in datasets
  if dataset in ['kaggle']:
    featurizer = None # kaggle dataset use its own features
    if split in ['random', 'scaffold']:
@@ -106,14 +109,28 @@ def benchmark_loading_datasets(hyper_parameters,
    if not model in ['tf_regression']:
      return

  if not split in [None, 'index','random','scaffold']:
  if dataset in ['gdb7']:
    featurizer = None
    if split in ['scaffold']: # gdb7 supports index, random and indice splitting
      return
    if not model in ['tf_regression']:
      return

  if split in ['year']:
    if not dataset in ['chembl']:
      return
  elif split in ['indice']:
    if not dataset in ['gdb7']:
      return
  elif not split in [None, 'index','random','scaffold']:
    raise ValueError('Splitter function not supported')
  
  loading_functions = {'tox21': load_tox21, 'muv': load_muv,
                       'pcba': load_pcba, 'nci': load_nci,
                       'sider': load_sider, 'toxcast': load_toxcast,
                       'kaggle': load_kaggle, 'delaney': load_delaney,
                       'pdbbind': load_pdbbind_grid}
                       'pdbbind': load_pdbbind_grid,
                       'chembl': load_chembl, 'gdb7': load_gdb7}
  
  print('-------------------------------------')
  print('Benchmark %s on dataset: %s' % (model, dataset))
@@ -131,22 +148,26 @@ def benchmark_loading_datasets(hyper_parameters,
  train_dataset, valid_dataset, test_dataset = all_dataset
  time_finish_loading = time.time()
  # time_finish_loading-time_start is the time(s) used for dataset loading
  if dataset in ['kaggle','pdbbind']:
  if dataset in ['kaggle', 'pdbbind', 'gdb7']:
    n_features = train_dataset.get_data_shape()[0]
    #kaggle dataset has customized features
    # dataset has customized features
    
  # running model
  for count, hp in enumerate(hyper_parameters[model]):
    time_start_fitting = time.time()
    if mode == 'classification':
      metric = 'auc'
      train_score, valid_score = benchmark_classification(
          train_dataset, valid_dataset, tasks, 
          transformers, hp, n_features,
          transformers, hp, n_features, metric=metric,
          model=model)      
    elif mode == 'regression':
      metric = 'r2'
      if dataset in ['gdb7']:
        metric = 'mae'
      train_score, valid_score = benchmark_regression(
          train_dataset, valid_dataset, tasks, 
          transformers, hp, n_features,
          transformers, hp, n_features, metric=metric,
          model=model)  
    time_finish_fitting = time.time()
    
@@ -163,16 +184,24 @@ def benchmark_loading_datasets(hyper_parameters,
          writer.writerow(output_line)
      else:
        for i in train_score:
          if metric == 'r2':
            output_line = [count, dataset, str(split), mode, 'train', i, 
                           train_score[i]['mean-pearson_r2_score'], 'valid', i, 
                           valid_score[i]['mean-pearson_r2_score'], 
                           'time_for_running',
                           time_finish_fitting-time_start_fitting]
          elif metric == 'mae':
            output_line = [count, dataset, str(split), mode, 'train', i, 
                           train_score[i]['mean-mean_absolute_error'], 'valid', i, 
                           valid_score[i]['mean-mean_absolute_error'], 
                           'time_for_running',
                           time_finish_fitting-time_start_fitting]
 
          writer.writerow(output_line)

def benchmark_classification(train_dataset, valid_dataset, tasks,
                             transformers, hyper_parameters, 
                             n_features, model='tf', seed=123):
                             n_features, metric='auc', model='tf', seed=123):
  """
  Calculate performance of different models on the specific dataset & tasks
  
@@ -207,6 +236,7 @@ def benchmark_classification(train_dataset, valid_dataset, tasks,
  valid_scores = {}
  
  # Initialize metrics
  if metric == 'auc':
    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
  
  assert model in ['rf', 'tf', 'tf_robust', 'logreg', 'graphconv']
@@ -382,8 +412,8 @@ def benchmark_classification(train_dataset, valid_dataset, tasks,

  
def benchmark_regression(train_dataset, valid_dataset, tasks,
                         transformers, hyper_parameters, 
                         n_features, model='tf_regression', seed=123):
                         transformers, hyper_parameters, n_features, 
                         metric='r2', model='tf_regression', seed=123):
  """
  Calculate performance of different models on the specific dataset & tasks
  
@@ -416,7 +446,10 @@ def benchmark_regression(train_dataset, valid_dataset, tasks,
  valid_scores = {}
  
  # Initialize metrics
  if metric == 'r2':
    regression_metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)
  elif metric == 'mae':
    regression_metric = dc.metrics.Metric(dc.metrics.mean_absolute_error, np.mean)

  assert model in ['tf_regression', 'graphconvreg']

@@ -511,7 +544,7 @@ if __name__ == '__main__':
           'tf_regression, graphconvreg')
  parser.add_argument('-d', action='append', dest='dataset_args', default=[], 
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' + 
           'kaggle, delaney, nci, pdbbind')
           'kaggle, delaney, nci, pdbbindi, chembl, gdb7')
  args = parser.parse_args()
  #Datasets and models used in the benchmark test
  splitters = args.splitter_args
@@ -525,7 +558,7 @@ if __name__ == '__main__':
              'tf_regression', 'graphconvreg']
  if len(datasets) == 0:
    datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 
                'delaney', 'nci', 'kaggle', 'pdbbind']
                'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'gdb7']

  #input hyperparameters
  #tf: dropouts, learning rate, layer_sizes, weight initial stddev,penalty,
Loading