Commit cef2632f authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #382 from miaecle/gdb7_mod

SAMPL dataset
parents 294805bb ab936d7b
Loading
Loading
Loading
Loading
+15 −6
Original line number Diff line number Diff line
@@ -245,12 +245,18 @@ Scaffold splitting

|Dataset         |Model               |Splitting   |Train score/R2|Valid score/R2|
|----------------|--------------------|------------|--------------|--------------|
|delaney         |MT-NN regression    |Index       |0.773         |0.574         |
|                |graphconv regression|Index       |0.991         |0.825         |
|                |MT-NN regression    |Random      |0.769         |0.591         |
|                |graphconv regression|Random      |0.996         |0.873         |
|                |MT-NN regression    |Scaffold    |0.782         |0.426         |
|                |graphconv regression|Scaffold    |0.994         |0.606         |
|delaney         |MT-NN regression    |Index       |0.868         |0.578         |
|                |graphconv regression|Index       |0.967         |0.790         |
|                |MT-NN regression    |Random      |0.865         |0.574         |
|                |graphconv regression|Random      |0.964         |0.782         |
|                |MT-NN regression    |Scaffold    |0.866         |0.342         |
|                |graphconv regression|Scaffold    |0.967         |0.606         |
|sampl           |MT-NN regression    |Index       |0.917         |0.764         |
|                |graphconv regression|Index       |0.982         |0.864         |
|                |MT-NN regression    |Random      |0.908         |0.830         |
|                |graphconv regression|Random      |0.987         |0.868         |
|                |MT-NN regression    |Scaffold    |0.891         |0.217         |
|                |graphconv regression|Scaffold    |0.985         |0.666         |
|nci             |MT-NN regression    |Index       |0.171         |0.062         |
|                |graphconv regression|Index       |0.123         |0.048         |
|                |MT-NN regression    |Random      |0.168         |0.085         |
@@ -286,6 +292,7 @@ Number of tasks and examples in the datasets
|sider           |27         |1427       |
|toxcast         |617        |8615       |
|delaney         |1          |1128       |
|sampl           |1          |643        |
|kaggle          |15         |173065     |
|nci             |60         |19127      |
|pdbbind(core)   |1          |195        |
@@ -322,6 +329,8 @@ Time needed for benchmark test(~20h in total)
|                |graph convolution   |80              |900            |
|delaney         |MT-NN regression    |10              |40             |
|                |graphconv regression|10              |40             |
|sampl           |MT-NN regression    |10              |30             |
|                |graphconv regression|10              |40             |
|nci             |MT-NN regression    |400             |1200           |
|                |graphconv regression|400             |2500           |
|pdbbind(core)   |MT-NN regression    |0(featurized)   |30             |
+8 −10
Original line number Diff line number Diff line
@@ -351,20 +351,18 @@ class IndiceSplitter(Splitter):
    """
    num_datapoints = len(dataset)
    indices = np.arange(num_datapoints).tolist()
    train_indices = []
    if self.valid_indices is None:
      self.valid_indices = []
    else:
      for indice in indices:
        if indice in self.valid_indices:
          indices.remove(indice)
    if self.test_indices is None:
      self.test_indices = []
    else:
    valid_test = self.valid_indices
    valid_test.extend(self.test_indices)
    for indice in indices:
        if indice in self.valid_indices:
          indices.remove(indice)
      if not indice in valid_test:
        train_indices.append(indice)
    
    return (indices, self.valid_indices, self.test_indices)
    return (train_indices, self.valid_indices, self.test_indices)


class ScaffoldSplitter(Splitter):
+6 −5
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ from nci.nci_datasets import load_nci
from pdbbind.pdbbind_datasets import load_pdbbind_grid
from chembl.chembl_datasets import load_chembl
from gdb7.gdb7_datasets import load_gdb7
from sampl.sampl_datasets import load_sampl

def benchmark_loading_datasets(hyper_parameters, 
                               dataset='tox21', model='tf', split=None,
@@ -77,7 +78,8 @@ def benchmark_loading_datasets(hyper_parameters,
  
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
    mode = 'classification'
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'gdb7']:
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 
                   'gdb7', 'sampl']:
    mode = 'regression'
  else:
    raise ValueError('Dataset not supported')
@@ -130,7 +132,8 @@ def benchmark_loading_datasets(hyper_parameters,
                       'sider': load_sider, 'toxcast': load_toxcast,
                       'kaggle': load_kaggle, 'delaney': load_delaney,
                       'pdbbind': load_pdbbind_grid,
                       'chembl': load_chembl, 'gdb7': load_gdb7}
                       'chembl': load_chembl, 'gdb7': load_gdb7,
                       'sampl': load_sampl}
  
  print('-------------------------------------')
  print('Benchmark %s on dataset: %s' % (model, dataset))
@@ -163,8 +166,6 @@ def benchmark_loading_datasets(hyper_parameters,
          model=model)      
    elif mode == 'regression':
      metric = 'r2'
      if dataset in ['gdb7']:
        metric = 'mae'
      train_score, valid_score = benchmark_regression(
          train_dataset, valid_dataset, tasks, 
          transformers, hp, n_features, metric=metric,
@@ -594,7 +595,7 @@ if __name__ == '__main__':
                           'dropouts': [0.25, 0.25], 
                           'penalty': 0.0005, 'penalty_type': 'l2', 
                           'batch_size': 128, 'nb_epoch': 50, 
                           'learning_rate': 0.00008}]
                           'learning_rate': 0.0008}]
  
  hps['graphconvreg'] = [{'batch_size': 128, 'nb_epoch': 20, 
                          'learning_rate': 0.0005, 'n_filters': 128, 
+2 −2
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ def load_gdb7_from_mat(split=0):
  if not os.path.exists('qm7.mat'): os.system('wget http://www.quantum-machine.org/data/qm7.mat')
  dataset = scipy.io.loadmat('qm7.mat')
  
  P = dataset['P'][range(0,split)+range(split+1,5)].flatten()
  P = dataset['P'][list(range(0,split))+list(range(split+1,5))].flatten()
  X = dataset['X'][P]
  y = dataset['T'][0,P]
  w = np.ones_like(y)
@@ -67,7 +67,7 @@ def load_gdb7(featurizer=None, split='random'):
  with open(split_file, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
      row_int = (np.asarray(list(map(int, row)))-1).tolist()
      row_int = (np.asarray(list(map(int, row)))).tolist()
      split_indices.append(row_int)
  
  
+644 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading