Commit 31aaf9c2 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #383 from calebgeniesse/clinical_trials

clintox dataset
parents 73d71f26 10209c98
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
@@ -212,6 +212,10 @@ Index splitting
|           |Multitask network   |0.830              |0.678              |
|           |robust MT-NN        |0.825              |0.680              |
|           |graph convolution   |0.821              |0.720              |
|clintox    |logistic regression |0.967              |0.676              |
|           |Multitask network   |0.934              |0.830              |
|           |robust MT-NN        |0.949              |0.827              |
|           |graph convolution   |0.946              |0.860              |

Random splitting

@@ -237,6 +241,10 @@ Random splitting
|           |Multitask network   |0.836              |0.684              |
|           |robust MT-NN        |0.822              |0.681              |
|           |graph convolution   |0.820              |0.717              |
|clintox    |logistic regression |0.972              |0.725              |
|           |Multitask network   |0.951              |0.834              |
|           |robust MT-NN        |0.959              |0.830              |
|           |graph convolution   |0.975              |0.876              |

Scaffold splitting

@@ -262,6 +270,10 @@ Scaffold splitting
|           |Multitask network   |0.828              |0.617              |
|           |robust MT-NN        |0.830              |0.614              |
|           |graph convolution   |0.832              |0.638              |
|clintox    |logistic regression |0.960              |0.803              |
|           |Multitask network   |0.947              |0.862              |
|           |robust MT-NN        |0.953              |0.890              |
|           |graph convolution   |0.957              |0.823              |

* Regression

@@ -313,6 +325,7 @@ Number of tasks and examples in the datasets
|pcba            |128        |439863     |
|sider           |27         |1427       |
|toxcast         |617        |8615       |
|clintox         |2          |1491       |
|delaney         |1          |1128       |
|sampl           |1          |643        |
|kaggle          |15         |173065     |
+7 −6
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@ from pdbbind.pdbbind_datasets import load_pdbbind_grid
from chembl.chembl_datasets import load_chembl
from gdb7.gdb7_datasets import load_gdb7
from sampl.sampl_datasets import load_sampl
from clintox.clintox_datasets import load_clintox

def benchmark_loading_datasets(hyper_parameters, 
                               dataset='tox21', model='tf', split=None,
@@ -66,7 +67,7 @@ def benchmark_loading_datasets(hyper_parameters,
      hyper parameters including dropout rate, learning rate, etc.
  dataset: string, optional (default='tox21')
      choice of which dataset to use, should be: tox21, muv, sider, 
      toxcast, pcba, delaney, kaggle, nci
      toxcast, pcba, delaney, kaggle, nci, clintox
  model: string,  optional (default='tf')
      choice of which model to use, should be: rf, tf, tf_robust, logreg,
      graphconv, tf_regression, graphconvreg
@@ -76,7 +77,7 @@ def benchmark_loading_datasets(hyper_parameters,
      path of result file
  """
  
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox']:
    mode = 'classification'
  elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 
                   'gdb7', 'sampl']:
@@ -133,7 +134,7 @@ def benchmark_loading_datasets(hyper_parameters,
                       'kaggle': load_kaggle, 'delaney': load_delaney,
                       'pdbbind': load_pdbbind_grid,
                       'chembl': load_chembl, 'gdb7': load_gdb7,
                       'sampl': load_sampl}
                       'sampl': load_sampl, 'clintox': load_clintox}
  
  print('-------------------------------------')
  print('Benchmark %s on dataset: %s' % (model, dataset))
@@ -545,7 +546,7 @@ if __name__ == '__main__':
           'tf_regression, graphconvreg')
  parser.add_argument('-d', action='append', dest='dataset_args', default=[], 
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' + 
           'kaggle, delaney, nci, pdbbindi, chembl, gdb7')
           'kaggle, delaney, nci, pdbbindi, chembl, gdb7, clintox')
  args = parser.parse_args()
  #Datasets and models used in the benchmark test
  splitters = args.splitter_args
@@ -558,7 +559,7 @@ if __name__ == '__main__':
    models = ['tf', 'tf_robust', 'logreg', 'graphconv', 
              'tf_regression', 'graphconvreg']
  if len(datasets) == 0:
    datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 
    datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox',
                'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'gdb7']

  #input hyperparameters
@@ -604,7 +605,7 @@ if __name__ == '__main__':

  for split in splitters:
    for dataset in datasets:
      if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba']:
      if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox']:
        for model in models:
          if model in ['tf', 'tf_robust', 'logreg', 'graphconv']:
            benchmark_loading_datasets(
+0 −0

Empty file added.

+52 −0
Original line number Diff line number Diff line
"""
Clinical Toxicity (clintox) dataset loader.
@author Caleb Geniesse
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem as dc


def load_clintox(featurizer='ECFP', split='index'):
  """Load, featurize, balance and split the clintox dataset.

  The raw table is read from ``datasets/clintox.csv.gz`` next to this
  module. Every column after the first one is treated as a task.

  Parameters
  ----------
  featurizer: string, optional (default='ECFP')
      key selecting the featurizer: 'ECFP' (circular fingerprint of
      size 1024) or 'GraphConv' (ConvMol featurizer)
  split: string, optional (default='index')
      key selecting the splitter: 'index', 'random' or 'scaffold'

  Returns
  -------
  tuple
      (clintox_tasks, (train, valid, test), transformers), where
      transformers is the list of transformers applied to the data

  Raises
  ------
  KeyError
      if featurizer or split is not one of the keys listed above
  """
  # Read the raw table once, only to discover task names and sizes.
  print("About to load clintox dataset.")
  module_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(module_dir, "./datasets/clintox.csv.gz")
  raw_table = dc.utils.save.load_from_disk(dataset_file)
  clintox_tasks = raw_table.columns.values[1:].tolist()
  print("Tasks in dataset: %s" % (clintox_tasks))
  print("Number of tasks in dataset: %s" % str(len(clintox_tasks)))
  print("Number of examples in dataset: %s" % str(raw_table.shape[0]))

  # Featurize straight from the file, using the "smiles" column as input.
  print("About to featurize clintox dataset.")
  available_featurizers = {
      'ECFP': dc.feat.CircularFingerprint(size=1024),
      'GraphConv': dc.feat.ConvMolFeaturizer(),
  }
  chosen_featurizer = available_featurizers[featurizer]
  loader = dc.data.CSVLoader(
      tasks=clintox_tasks,
      smiles_field="smiles",
      featurizer=chosen_featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Apply the transformers in order; the same list is handed back to the
  # caller alongside the split datasets.
  print("About to transform clintox dataset.")
  balancer = dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  transformers = [balancer]
  for step in transformers:
    dataset = step.transform(dataset)

  # Split into train / validation / test with the requested strategy.
  print("About to split clintox dataset.")
  available_splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
  }
  chosen_splitter = available_splitters[split]
  train, valid, test = chosen_splitter.train_valid_test_split(dataset)

  return clintox_tasks, (train, valid, test), transformers
+72 −0
Original line number Diff line number Diff line
"""
Script that trains graph-conv models on clintox dataset.
@author Caleb Geniesse
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import tensorflow as tf
from keras import backend as K

import deepchem as dc
from clintox_datasets import load_clintox

# Only for debug! Fixes NumPy's random seed so runs are repeatable.
np.random.seed(123)

# Dedicated graph, plus the session that is registered with Keras.
g = tf.Graph()
sess = tf.Session(graph=g)
K.set_session(sess)

with g.as_default():
  # Load clintox dataset, featurized for graph convolutions
  clintox_tasks, clintox_datasets, transformers = load_clintox(
      featurizer='GraphConv', split='random')
  # test_dataset is unpacked but not evaluated by this script.
  train_dataset, valid_dataset, test_dataset = clintox_datasets

  # Evaluation metric: ROC-AUC, with np.mean passed as the second argument
  # (presumably the across-task reduction -- confirm against dc.metrics).
  metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean,
                             mode="classification")

  # Do setup required for tf/keras models
  # Number of features on conv-mols
  n_feat = 75
  # Batch size of models
  batch_size = 50
  graph_model = dc.nn.SequentialGraph(n_feat)
  graph_model.add(dc.nn.GraphConv(64, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphPool())
  graph_model.add(dc.nn.GraphConv(64, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphPool())
  # Gather Projection
  graph_model.add(dc.nn.Dense(128, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))

  # The model runs in its own session. It is given a distinct name so it
  # no longer shadows the Keras session `sess` created above.
  with tf.Session() as model_sess:
    model = dc.models.MultitaskGraphClassifier(model_sess, graph_model,
                                               len(clintox_tasks),
                                               batch_size=batch_size,
                                               learning_rate=1e-3,
                                               learning_rate_decay_time=1000,
                                               optimizer_type="adam",
                                               beta1=.9, beta2=.999)

    # Train the model
    model.fit(train_dataset, nb_epoch=10)

    print("Evaluating model")
    train_scores = model.evaluate(train_dataset, [metric], transformers)
    valid_scores = model.evaluate(valid_dataset, [metric], transformers)

    print("Train scores")
    print(train_scores)

    print("Validation scores")
    print(valid_scores)

# Release the Keras session; the original script never closed it.
sess.close()
Loading