SAMPL performance (5244ca0b) · Commits · 钟慕尧 / deepchem

README.md

+15 −6

Original line number	Diff line number	Diff line
		@@ -245,12 +245,18 @@ Scaffold splitting

		\|Dataset \|Model \|Splitting \|Train score/R2\|Valid score/R2\|
		\|----------------\|--------------------\|------------\|--------------\|--------------\|
		\|delaney \|MT-NN regression \|Index \|0.773 \|0.574 \|
		\| \|graphconv regression\|Index \|0.991 \|0.825 \|
		\| \|MT-NN regression \|Random \|0.769 \|0.591 \|
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|delaney \|MT-NN regression \|Index \|0.868 \|0.578 \|
		\| \|graphconv regression\|Index \|0.967 \|0.790 \|
		\| \|MT-NN regression \|Random \|0.865 \|0.574 \|
		\| \|graphconv regression\|Random \|0.964 \|0.782 \|
		\| \|MT-NN regression \|Scaffold \|0.866 \|0.342 \|
		\| \|graphconv regression\|Scaffold \|0.967 \|0.606 \|
		\|sampl \|MT-NN regression \|Index \|0.917 \|0.764 \|
		\| \|graphconv regression\|Index \|0.982 \|0.864 \|
		\| \|MT-NN regression \|Random \|0.908 \|0.830 \|
		\| \|graphconv regression\|Random \|0.987 \|0.868 \|
		\| \|MT-NN regression \|Scaffold \|0.891 \|0.217 \|
		\| \|graphconv regression\|Scaffold \|0.985 \|0.666 \|
		\|nci \|MT-NN regression \|Index \|0.171 \|0.062 \|
		\| \|graphconv regression\|Index \|0.123 \|0.048 \|
		\| \|MT-NN regression \|Random \|0.168 \|0.085 \|
		@@ -286,6 +292,7 @@ Number of tasks and examples in the datasets
		\|sider \|27 \|1427 \|
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|sampl \|1 \|643 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|19127 \|
		\|pdbbind(core) \|1 \|195 \|
		@@ -322,6 +329,8 @@ Time needed for benchmark test(~20h in total)
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|sampl \|MT-NN regression \|10 \|30 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|400 \|1200 \|
		\| \|graphconv regression\|400 \|2500 \|
		\|pdbbind(core) \|MT-NN regression \|0(featurized) \|30 \|

examples/benchmark.py

+6 −5

Original line number	Diff line number	Diff line
		@@ -52,6 +52,7 @@ from nci.nci_datasets import load_nci
		from pdbbind.pdbbind_datasets import load_pdbbind_grid
		from chembl.chembl_datasets import load_chembl
		from gdb7.gdb7_datasets import load_gdb7
		from sampl.sampl_datasets import load_sampl

		def benchmark_loading_datasets(hyper_parameters,
		dataset='tox21', model='tf', split=None,
		@@ -77,7 +78,8 @@ def benchmark_loading_datasets(hyper_parameters,

		if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
		mode = 'classification'
		elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'gdb7']:
		elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl',
		'gdb7', 'sampl']:
		mode = 'regression'
		else:
		raise ValueError('Dataset not supported')
		@@ -130,7 +132,8 @@ def benchmark_loading_datasets(hyper_parameters,
		'sider': load_sider, 'toxcast': load_toxcast,
		'kaggle': load_kaggle, 'delaney': load_delaney,
		'pdbbind': load_pdbbind_grid,
		'chembl': load_chembl, 'gdb7': load_gdb7}
		'chembl': load_chembl, 'gdb7': load_gdb7,
		'sampl': load_sampl}

		print('-------------------------------------')
		print('Benchmark %s on dataset: %s' % (model, dataset))
		@@ -163,8 +166,6 @@ def benchmark_loading_datasets(hyper_parameters,
		model=model)
		elif mode == 'regression':
		metric = 'r2'
		if dataset in ['gdb7']:
		metric = 'mae'
		train_score, valid_score = benchmark_regression(
		train_dataset, valid_dataset, tasks,
		transformers, hp, n_features, metric=metric,
		@@ -594,7 +595,7 @@ if __name__ == '__main__':
		'dropouts': [0.25, 0.25],
		'penalty': 0.0005, 'penalty_type': 'l2',
		'batch_size': 128, 'nb_epoch': 50,
		'learning_rate': 0.00008}]
		'learning_rate': 0.0008}]

		hps['graphconvreg'] = [{'batch_size': 128, 'nb_epoch': 20,
		'learning_rate': 0.0005, 'n_filters': 128,

examples/SAMPL/SAMPL.csv→examples/sampl/SAMPL.csv

+0 −0

File moved.

View file

examples/sampl/__init__.py

0 → 100644

+0 −0

Empty file added.

examples/SAMPL/SAMPL_datasets.py→examples/sampl/sampl_datasets.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -10,14 +10,14 @@ import numpy as np
		import shutil
		import deepchem as dc

		def load_SAMPL(featurizer='ECFP', split='index'):
		def load_sampl(featurizer='ECFP', split='index'):
		"""Load SAMPL datasets."""
		# Featurize SAMPL dataset
		print("About to featurize SAMPL dataset.")
		current_dir = os.path.dirname(os.path.realpath(__file__))
		dataset_file = os.path.join(
		current_dir, "./SAMPL.csv")
		SAMPL_tasks = ['expt', 'calc']
		SAMPL_tasks = ['expt']
		if featurizer == 'ECFP':
		featurizer = dc.feat.CircularFingerprint(size=1024)
		elif featurizer == 'GraphConv':

Original line number	Diff line number	Diff line
		@@ -245,12 +245,18 @@ Scaffold splitting

		\|Dataset \|Model \|Splitting \|Train score/R2\|Valid score/R2\|
		\|----------------\|--------------------\|------------\|--------------\|--------------\|
		\|delaney \|MT-NN regression \|Index \|0.773 \|0.574 \|
		\| \|graphconv regression\|Index \|0.991 \|0.825 \|
		\| \|MT-NN regression \|Random \|0.769 \|0.591 \|
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|delaney \|MT-NN regression \|Index \|0.868 \|0.578 \|
		\| \|graphconv regression\|Index \|0.967 \|0.790 \|
		\| \|MT-NN regression \|Random \|0.865 \|0.574 \|
		\| \|graphconv regression\|Random \|0.964 \|0.782 \|
		\| \|MT-NN regression \|Scaffold \|0.866 \|0.342 \|
		\| \|graphconv regression\|Scaffold \|0.967 \|0.606 \|
		\|sampl \|MT-NN regression \|Index \|0.917 \|0.764 \|
		\| \|graphconv regression\|Index \|0.982 \|0.864 \|
		\| \|MT-NN regression \|Random \|0.908 \|0.830 \|
		\| \|graphconv regression\|Random \|0.987 \|0.868 \|
		\| \|MT-NN regression \|Scaffold \|0.891 \|0.217 \|
		\| \|graphconv regression\|Scaffold \|0.985 \|0.666 \|
		\|nci \|MT-NN regression \|Index \|0.171 \|0.062 \|
		\| \|graphconv regression\|Index \|0.123 \|0.048 \|
		\| \|MT-NN regression \|Random \|0.168 \|0.085 \|
		@@ -286,6 +292,7 @@ Number of tasks and examples in the datasets
		\|sider \|27 \|1427 \|
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|sampl \|1 \|643 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|19127 \|
		\|pdbbind(core) \|1 \|195 \|
		@@ -322,6 +329,8 @@ Time needed for benchmark test(~20h in total)
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|sampl \|MT-NN regression \|10 \|30 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|400 \|1200 \|
		\| \|graphconv regression\|400 \|2500 \|
		\|pdbbind(core) \|MT-NN regression \|0(featurized) \|30 \|

Admin message