Commit 49bc3cec authored by miaecle's avatar miaecle
Browse files

Merge remote-tracking branch 'remotes/mine/gdb7' into tf0.12

parents e1e929f6 fd9c2044
Loading
Loading
Loading
Loading
+63 −52
Original line number Diff line number Diff line
@@ -244,7 +244,7 @@ Scaffold splitting
* Regression

|Dataset         |Model               |Splitting   |Train score/R2|Valid score/R2|
|-----------|--------------------|------------|--------------|--------------|
|----------------|--------------------|------------|--------------|--------------|
|delaney         |MT-NN regression    |Index       |0.773         |0.574         |
|                |graphconv regression|Index       |0.991         |0.825         |
|                |MT-NN regression    |Random      |0.769         |0.591         |
@@ -257,6 +257,9 @@ Scaffold splitting
|                |graphconv regression|Random      |0.117         |0.076         |
|                |MT-NN regression    |Scaffold    |0.180         |0.052         |
|                |graphconv regression|Scaffold    |0.131         |0.046         |
|pdbbind(core)   |MT-NN regression    |Random      |0.973         |0.494         |
|pdbbind(refined)|MT-NN regression    |Random      |0.987         |0.503         |
|pdbbind(full)   |MT-NN regression    |Random      |0.983         |0.528         |
|kaggle          |MT-NN regression    |User-defined|0.748         |0.452         |

* General features
@@ -264,7 +267,7 @@ Scaffold splitting
Number of tasks and examples in the datasets

|Dataset         |N(tasks)   |N(samples) |
|-----------|-----------|-----------| 
|----------------|-----------|-----------| 
|tox21           |12         |8014       |
|muv             |17         |93127      |
|pcba            |128        |439863     |
@@ -273,11 +276,16 @@ Number of tasks and examples in the datasets
|delaney         |1          |1128       |
|kaggle          |15         |173065     |
|nci             |60         |19127      |
|pdbbind(core)   |1          |195        |
|pdbbind(refined)|1          |3706       |
|pdbbind(full)   |1          |11908      |



Time needed for benchmark test (~20h in total)

|Dataset         |Model               |Time(loading)/s |Time(running)/s|
|-----------|--------------------|----------------|---------------| 
|----------------|--------------------|----------------|---------------| 
|tox21           |logistic regression |30              |60             |
|                |Multitask network   |30              |60             |
|                |robust MT-NN        |30              |90             |
@@ -302,6 +310,9 @@ Time needed for benchmark test(~20h in total)
|                |graphconv regression|10              |40             |
|nci             |MT-NN regression    |400             |1200           |
|                |graphconv regression|400             |2500           |
|pdbbind(core)   |MT-NN regression    |0(featurized)   |30             |
|pdbbind(refined)|MT-NN regression    |0(featurized)   |40             |
|pdbbind(full)   |MT-NN regression    |0(featurized)   |60             |
|kaggle          |MT-NN regression    |2200            |3200           |


+4 −2
Original line number Diff line number Diff line
@@ -269,6 +269,7 @@ class MolecularWeightSplitter(Splitter):
    """

    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    if not seed is None:
      np.random.seed(seed)

    mws = []
@@ -299,6 +300,7 @@ class RandomSplitter(Splitter):
    Splits internal compounds randomly into train/validation/test.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    if not seed is None:
      np.random.seed(seed)
    num_datapoints = len(dataset)
    train_cutoff = int(frac_train * num_datapoints)
+21 −9
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ on datasets: muv, pcba, tox21, sider, toxcast
Giving regression performances of:
    MultitaskDNN(tf_regression),
    Graph convolution regression(graphconvreg)
on datasets: delaney, nci, kaggle
on datasets: delaney, nci, kaggle, pdbbind

time estimation listed in README file

@@ -49,6 +49,7 @@ from sider.sider_datasets import load_sider
from kaggle.kaggle_datasets import load_kaggle
from delaney.delaney_datasets import load_delaney
from nci.nci_datasets import load_nci
from pdbbind.pdbbind_datasets import load_pdbbind_grid

def benchmark_loading_datasets(hyper_parameters, 
                               dataset='tox21', model='tf', split=None,
@@ -74,7 +75,7 @@ def benchmark_loading_datasets(hyper_parameters,
  
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
    mode = 'classification'
  elif dataset in ['kaggle', 'delaney', 'nci']:
  elif dataset in ['kaggle', 'delaney', 'nci','pdbbind']:
    mode = 'regression'
  else:
    raise ValueError('Dataset not supported')
@@ -91,7 +92,19 @@ def benchmark_loading_datasets(hyper_parameters,
  
  if dataset in ['kaggle']:
    featurizer = None #kaggle dataset use its own features
    if split in ['random', 'scaffold']:
      return
    else:
      split = None #kaggle dataset is already splitted
    if not model in ['tf_regression']:
      return

  if dataset in ['pdbbind']:
    featurizer = 'grid' #pdbbind use grid featurizer
    if split in ['scaffold', 'index']:
      return #skip the scaffold and index splitting of pdbbind
    if not model in ['tf_regression']:
      return
  
  if not split in [None, 'index','random','scaffold']:
    raise ValueError('Splitter function not supported')
@@ -99,7 +112,8 @@ def benchmark_loading_datasets(hyper_parameters,
  loading_functions = {'tox21': load_tox21, 'muv': load_muv,
                       'pcba': load_pcba, 'nci': load_nci,
                       'sider': load_sider, 'toxcast': load_toxcast,
                       'kaggle': load_kaggle, 'delaney': load_delaney}
                       'kaggle': load_kaggle, 'delaney': load_delaney,
                       'pdbbind': load_pdbbind_grid}
  
  print('-------------------------------------')
  print('Benchmark %s on dataset: %s' % (model, dataset))
@@ -117,7 +131,7 @@ def benchmark_loading_datasets(hyper_parameters,
  train_dataset, valid_dataset, test_dataset = all_dataset
  time_finish_loading = time.time()
  #time_finish_loading-time_start is the time(s) used for dataset loading
  if dataset in ['kaggle']:
  if dataset in ['kaggle','pdbbind']:
    n_features = train_dataset.get_data_shape()[0]
    #kaggle dataset has customized features
    
@@ -497,7 +511,7 @@ if __name__ == '__main__':
           'tf_regression, graphconvreg')
  parser.add_argument('-d', action='append', dest='dataset_args', default=[], 
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' + 
           'kaggle, delaney, nci')
           'kaggle, delaney, nci, pdbbind')
  args = parser.parse_args()
  #Datasets and models used in the benchmark test
  splitters = args.splitter_args
@@ -511,7 +525,7 @@ if __name__ == '__main__':
              'tf_regression', 'graphconvreg']
  if len(datasets) == 0:
    datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 
                'delaney', 'nci', 'kaggle']
                'delaney', 'nci', 'kaggle', 'pdbbind']

  #input hyperparameters
  #tf: dropouts, learning rate, layer_sizes, weight initial stddev,penalty,
@@ -562,8 +576,6 @@ if __name__ == '__main__':
            benchmark_loading_datasets(
                hps, dataset=dataset, model=model, split=split, out_path='.')
      else:
        if dataset in ['kaggle']:
          datasets.remove('kaggle') #kaggle only needs to be run once
        for model in models:
          if model in ['tf_regression', 'graphconvreg']:
            benchmark_loading_datasets(
+42 −0
Original line number Diff line number Diff line
"""
gdb7 dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc

def load_gdb7(featurizer=None, split='index'):
  """Load, featurize, normalize and split the gdb7 dataset.

  Reads `gdb7.sdf` from the directory containing this module, featurizes it
  with `dc.feat.CoulombMatrixEig(23)`, applies a NormalizationTransformer to
  X and another to y (both fitted on the full featurized dataset), and then
  splits the transformed dataset into train/valid/test sets.

  Parameters
  ----------
  featurizer: None
    Must be left as None. Only the Coulomb matrix featurizer is supported,
    and it is constructed internally.
  split: str
    Name of the splitter to use, either 'index' or 'random'.

  Returns
  -------
  tuple
    `(gdb7_tasks, (train, valid, test), transformers)` where `gdb7_tasks` is
    the list of task names and `transformers` is the list of fitted
    transformers that were applied to the data.

  Raises
  ------
  ValueError
    If `featurizer` is not None.
  KeyError
    If `split` is not one of the supported splitter names.
  """
  # Validate both arguments up front so that a bad call fails immediately
  # instead of after the (expensive) featurization and normalization steps.
  if featurizer is not None:
    raise ValueError('Only support Coulomb Matrix featurizer')
  supported_splits = ('index', 'random')
  if split not in supported_splits:
    # KeyError is kept on purpose: it is what the splitter lookup further
    # down would have raised for the same input.
    raise KeyError('Unsupported split %r; expected one of %s'
                   % (split, ', '.join(supported_splits)))

  # Featurize gdb7 dataset
  print("About to featurize gdb7 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./gdb7.sdf")
  gdb7_tasks = ["u0_atom"]
  featurizer = dc.feat.CoulombMatrixEig(23)
  loader = dc.data.SDFLoader(tasks=gdb7_tasks, smiles_field="smiles",
                             mol_field="mol", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)

  # Initialize transformers; both are constructed from the untransformed
  # dataset before either one is applied.
  transformers = [
      dc.trans.NormalizationTransformer(transform_X=True, dataset=dataset),
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter()}
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return gdb7_tasks, (train, valid, test), transformers
+37 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow singletask models on GDB7 dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem as dc
import numpy as np
from gdb7_datasets import load_gdb7

# Seed NumPy's global RNG before anything else runs.
np.random.seed(123)

# load_gdb7 returns the task names, the (train, valid, test) datasets and the
# transformers that were applied to them.
gdb7_tasks, datasets, transformers = load_gdb7()
train_dataset, valid_dataset, test_dataset = datasets

# Mean absolute error is the metric reported for both splits below.
regression_metric = dc.metrics.Metric(
    dc.metrics.mean_absolute_error, mode="regression")

# Every per-layer hyperparameter list is derived from layer_sizes so the
# lists always have matching lengths.
layer_sizes = [2000, 800, 800, 1000]
n_layers = len(layer_sizes)

model = dc.models.TensorflowMultiTaskRegressor(
    n_tasks=len(gdb7_tasks),
    # NOTE(review): 23 presumably matches the CoulombMatrixEig(23) featurizer
    # used by load_gdb7 -- confirm if the featurizer ever changes.
    n_features=23,
    learning_rate=.001,
    momentum=.8,
    batch_size=512,
    weight_init_stddevs=[1 / np.sqrt(width) for width in layer_sizes],
    bias_init_consts=[0.] * n_layers,
    layer_sizes=layer_sizes,
    dropouts=[0.1] * n_layers)

# Train on the training split, then save the fitted model.
model.fit(train_dataset)
model.save()

# The transformers are passed to evaluate alongside the metric for each split.
metrics = [regression_metric]

train_scores = model.evaluate(train_dataset, metrics, transformers)
print("Train scores [kcal/mol]")
print(train_scores)

valid_scores = model.evaluate(valid_dataset, metrics, transformers)
print("Validation scores [kcal/mol]")
print(valid_scores)
Loading