Merge pull request #383 from calebgeniesse/clinical_trials (31aaf9c2) · Commits · 钟慕尧 / deepchem

README.md

+13 −0

Original line number	Diff line number	Diff line
		@@ -212,6 +212,10 @@ Index splitting
		\| \|Multitask network \|0.830 \|0.678 \|
		\| \|robust MT-NN \|0.825 \|0.680 \|
		\| \|graph convolution \|0.821 \|0.720 \|
		\|clintox \|logistic regression \|0.967 \|0.676 \|
		\| \|Multitask network \|0.934 \|0.830 \|
		\| \|robust MT-NN \|0.949 \|0.827 \|
		\| \|graph convolution \|0.946 \|0.860 \|

		Random splitting

		@@ -237,6 +241,10 @@ Random splitting
		\| \|Multitask network \|0.836 \|0.684 \|
		\| \|robust MT-NN \|0.822 \|0.681 \|
		\| \|graph convolution \|0.820 \|0.717 \|
		\|clintox \|logistic regression \|0.972 \|0.725 \|
		\| \|Multitask network \|0.951 \|0.834 \|
		\| \|robust MT-NN \|0.959 \|0.830 \|
		\| \|graph convolution \|0.975 \|0.876 \|

		Scaffold splitting

		@@ -262,6 +270,10 @@ Scaffold splitting
		\| \|Multitask network \|0.828 \|0.617 \|
		\| \|robust MT-NN \|0.830 \|0.614 \|
		\| \|graph convolution \|0.832 \|0.638 \|
		\|clintox \|logistic regression \|0.960 \|0.803 \|
		\| \|Multitask network \|0.947 \|0.862 \|
		\| \|robust MT-NN \|0.953 \|0.890 \|
		\| \|graph convolution \|0.957 \|0.823 \|

		* Regression

		@@ -313,6 +325,7 @@ Number of tasks and examples in the datasets
		\|pcba \|128 \|439863 \|
		\|sider \|27 \|1427 \|
		\|toxcast \|617 \|8615 \|
		\|clintox \|2 \|1491 \|
		\|delaney \|1 \|1128 \|
		\|sampl \|1 \|643 \|
		\|kaggle \|15 \|173065 \|

examples/benchmark.py

+7 −6

Original line number	Diff line number	Diff line
		@@ -53,6 +53,7 @@ from pdbbind.pdbbind_datasets import load_pdbbind_grid
		from chembl.chembl_datasets import load_chembl
		from gdb7.gdb7_datasets import load_gdb7
		from sampl.sampl_datasets import load_sampl
		from clintox.clintox_datasets import load_clintox

		def benchmark_loading_datasets(hyper_parameters,
		dataset='tox21', model='tf', split=None,
		@@ -66,7 +67,7 @@ def benchmark_loading_datasets(hyper_parameters,
		hyper parameters including dropout rate, learning rate, etc.
		dataset: string, optional (default='tox21')
		choice of which dataset to use, should be: tox21, muv, sider,
		toxcast, pcba, delaney, kaggle, nci
		toxcast, pcba, delaney, kaggle, nci, clintox
		model: string, optional (default='tf')
		choice of which model to use, should be: rf, tf, tf_robust, logreg,
		graphconv, tf_regression, graphconvreg
		@@ -76,7 +77,7 @@ def benchmark_loading_datasets(hyper_parameters,
		path of result file
		"""

		if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
		if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox']:
		mode = 'classification'
		elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl',
		'gdb7', 'sampl']:
		@@ -133,7 +134,7 @@ def benchmark_loading_datasets(hyper_parameters,
		'kaggle': load_kaggle, 'delaney': load_delaney,
		'pdbbind': load_pdbbind_grid,
		'chembl': load_chembl, 'gdb7': load_gdb7,
		'sampl': load_sampl}
		'sampl': load_sampl, 'clintox': load_clintox}

		print('-------------------------------------')
		print('Benchmark %s on dataset: %s' % (model, dataset))
		@@ -545,7 +546,7 @@ if __name__ == '__main__':
		'tf_regression, graphconvreg')
		parser.add_argument('-d', action='append', dest='dataset_args', default=[],
		help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' +
		'kaggle, delaney, nci, pdbbindi, chembl, gdb7')
		'kaggle, delaney, nci, pdbbindi, chembl, gdb7, clintox')
		args = parser.parse_args()
		#Datasets and models used in the benchmark test
		splitters = args.splitter_args
		@@ -558,7 +559,7 @@ if __name__ == '__main__':
		models = ['tf', 'tf_robust', 'logreg', 'graphconv',
		'tf_regression', 'graphconvreg']
		if len(datasets) == 0:
		datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba',
		datasets = ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox',
		'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'gdb7']

		#input hyperparameters
		@@ -604,7 +605,7 @@ if __name__ == '__main__':

		for split in splitters:
		for dataset in datasets:
		if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba']:
		if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox']:
		for model in models:
		if model in ['tf', 'tf_robust', 'logreg', 'graphconv']:
		benchmark_loading_datasets(

examples/clintox/__init__.py

0 → 100644

+0 −0

Empty file added.

examples/clintox/clintox_datasets.py

0 → 100644

+52 −0

Original line number	Diff line number	Diff line
		"""
		Clinical Toxicity (clintox) dataset loader.
		@author Caleb Geniesse
		"""

		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		import os
		import deepchem as dc


		def load_clintox(featurizer='ECFP', split='index'):
		    """Load, featurize, balance and split the clintox dataset.

		    The data is read from ``./datasets/clintox.csv.gz`` relative to the
		    directory containing this file. Every column after the first one is
		    treated as a task.

		    Parameters
		    ----------
		    featurizer: string, optional (default='ECFP')
		        key selecting the featurizer: 'ECFP' or 'GraphConv'
		    split: string, optional (default='index')
		        key selecting the splitter: 'index', 'random' or 'scaffold'

		    Returns
		    -------
		    A 3-tuple ``(clintox_tasks, (train, valid, test), transformers)``.

		    Raises
		    ------
		    KeyError if `featurizer` or `split` is not one of the keys above.
		    """

		    # --- Load the raw table to discover the task columns ---
		    print("About to load clintox dataset.")
		    here = os.path.dirname(os.path.realpath(__file__))
		    csv_path = os.path.join(here, "./datasets/clintox.csv.gz")
		    raw_table = dc.utils.save.load_from_disk(csv_path)
		    clintox_tasks = raw_table.columns.values[1:].tolist()
		    print("Tasks in dataset: %s" % (clintox_tasks))
		    print("Number of tasks in dataset: %s" % len(clintox_tasks))
		    print("Number of examples in dataset: %s" % raw_table.shape[0])

		    # --- Featurize ---
		    print("About to featurize clintox dataset.")
		    available_featurizers = {
		        'ECFP': dc.feat.CircularFingerprint(size=1024),
		        'GraphConv': dc.feat.ConvMolFeaturizer(),
		    }
		    chosen_featurizer = available_featurizers[featurizer]
		    csv_loader = dc.data.CSVLoader(
		        tasks=clintox_tasks,
		        smiles_field="smiles",
		        featurizer=chosen_featurizer)
		    featurized = csv_loader.featurize(csv_path, shard_size=8192)

		    # --- Transform (the transformer is built from the featurized data) ---
		    print("About to transform clintox dataset.")
		    balancer = dc.trans.BalancingTransformer(
		        transform_w=True, dataset=featurized)
		    transformers = [balancer]
		    for step in transformers:
		        featurized = step.transform(featurized)

		    # --- Split ---
		    print("About to split clintox dataset.")
		    available_splitters = {
		        'index': dc.splits.IndexSplitter(),
		        'random': dc.splits.RandomSplitter(),
		        'scaffold': dc.splits.ScaffoldSplitter(),
		    }
		    chosen_splitter = available_splitters[split]
		    train_set, valid_set, test_set = (
		        chosen_splitter.train_valid_test_split(featurized))

		    return clintox_tasks, (train_set, valid_set, test_set), transformers

examples/clintox/clintox_graph_conv.py

0 → 100644

+72 −0

Original line number	Diff line number	Diff line
		"""
		Script that trains graph-conv models on clintox dataset.
		@author Caleb Geniesse
		"""
		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		import numpy as np
		import tensorflow as tf
		from keras import backend as K

		import deepchem as dc
		from clintox_datasets import load_clintox

		# NOTE(review): indentation is not preserved in this view, so which
		# statements are nested under the two `with` blocks below must be
		# confirmed against the original file.

		# Only for debug! Seeds NumPy's global random number generator.
		np.random.seed(123)

		# Create a dedicated TensorFlow graph, open a session on it, and register
		# that session with the Keras backend.
		g = tf.Graph()
		sess = tf.Session(graph=g)
		K.set_session(sess)

		with g.as_default():
		# Load clintox dataset
		# NOTE(review): n_features is assigned here but not referenced again in
		# this script as shown.
		n_features = 1024
		clintox_tasks, clintox_datasets, transformers = load_clintox(
		featurizer='GraphConv', split='random')
		# NOTE(review): test_dataset is unpacked but never evaluated below.
		train_dataset, valid_dataset, test_dataset = clintox_datasets

		# Fit models
		# ROC-AUC metric combined with np.mean (presumably averaged over the
		# tasks -- confirm against dc.metrics.Metric).
		metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean,
		mode="classification")

		# Do setup required for tf/keras models
		# Number of features on conv-mols
		n_feat = 75
		# Batch size of models
		batch_size = 50
		# Two GraphConv -> BatchNormalization -> GraphPool stages.
		graph_model = dc.nn.SequentialGraph(n_feat)
		graph_model.add(dc.nn.GraphConv(64, activation='relu'))
		graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
		graph_model.add(dc.nn.GraphPool())
		graph_model.add(dc.nn.GraphConv(64, activation='relu'))
		graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
		graph_model.add(dc.nn.GraphPool())
		# Gather Projection
		graph_model.add(dc.nn.Dense(128, activation='relu'))
		graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
		graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))
		# Dense post-processing layer
		# NOTE(review): no layer is added after the comment above.

		# NOTE(review): this rebinds `sess`, shadowing the session that was
		# registered with Keras above; no graph is passed explicitly here, so
		# which graph this session uses depends on the nesting -- confirm.
		with tf.Session() as sess:
		model = dc.models.MultitaskGraphClassifier(sess, graph_model,
		len(clintox_tasks),
		batch_size=batch_size,
		learning_rate=1e-3,
		learning_rate_decay_time=1000,
		optimizer_type="adam",
		beta1=.9, beta2=.999)

		# Fit trained model
		model.fit(train_dataset, nb_epoch=10)

		# Score the fitted model on the training and validation splits.
		print("Evaluating model")
		train_scores = model.evaluate(train_dataset, [metric], transformers)
		valid_scores = model.evaluate(valid_dataset, [metric], transformers)

		print("Train scores")
		print(train_scores)

		print("Validation scores")
		print(valid_scores)

Original line number	Diff line number	Diff line
		@@ -212,6 +212,10 @@ Index splitting
		\| \|Multitask network \|0.830 \|0.678 \|
		\| \|robust MT-NN \|0.825 \|0.680 \|
		\| \|graph convolution \|0.821 \|0.720 \|
		\|clintox \|logistic regression \|0.967 \|0.676 \|
		\| \|Multitask network \|0.934 \|0.830 \|
		\| \|robust MT-NN \|0.949 \|0.827 \|
		\| \|graph convolution \|0.946 \|0.860 \|

		Random splitting

		@@ -237,6 +241,10 @@ Random splitting
		\| \|Multitask network \|0.836 \|0.684 \|
		\| \|robust MT-NN \|0.822 \|0.681 \|
		\| \|graph convolution \|0.820 \|0.717 \|
		\|clintox \|logistic regression \|0.972 \|0.725 \|
		\| \|Multitask network \|0.951 \|0.834 \|
		\| \|robust MT-NN \|0.959 \|0.830 \|
		\| \|graph convolution \|0.975 \|0.876 \|

		Scaffold splitting

		@@ -262,6 +270,10 @@ Scaffold splitting
		\| \|Multitask network \|0.828 \|0.617 \|
		\| \|robust MT-NN \|0.830 \|0.614 \|
		\| \|graph convolution \|0.832 \|0.638 \|
		\|clintox \|logistic regression \|0.960 \|0.803 \|
		\| \|Multitask network \|0.947 \|0.862 \|
		\| \|robust MT-NN \|0.953 \|0.890 \|
		\| \|graph convolution \|0.957 \|0.823 \|

		* Regression

		@@ -313,6 +325,7 @@ Number of tasks and examples in the datasets
		\|pcba \|128 \|439863 \|
		\|sider \|27 \|1427 \|
		\|toxcast \|617 \|8615 \|
		\|clintox \|2 \|1491 \|
		\|delaney \|1 \|1128 \|
		\|sampl \|1 \|643 \|
		\|kaggle \|15 \|173065 \|

Admin message