Merge pull request #382 from miaecle/gdb7_mod (cef2632f) · Commits · 钟慕尧 / deepchem

README.md

+15 −6

Original line number	Diff line number	Diff line
		@@ -245,12 +245,18 @@ Scaffold splitting

		\|Dataset \|Model \|Splitting \|Train score/R2\|Valid score/R2\|
		\|----------------\|--------------------\|------------\|--------------\|--------------\|
		\|delaney \|MT-NN regression \|Index \|0.773 \|0.574 \|
		\| \|graphconv regression\|Index \|0.991 \|0.825 \|
		\| \|MT-NN regression \|Random \|0.769 \|0.591 \|
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|delaney \|MT-NN regression \|Index \|0.868 \|0.578 \|
		\| \|graphconv regression\|Index \|0.967 \|0.790 \|
		\| \|MT-NN regression \|Random \|0.865 \|0.574 \|
		\| \|graphconv regression\|Random \|0.964 \|0.782 \|
		\| \|MT-NN regression \|Scaffold \|0.866 \|0.342 \|
		\| \|graphconv regression\|Scaffold \|0.967 \|0.606 \|
		\|sampl \|MT-NN regression \|Index \|0.917 \|0.764 \|
		\| \|graphconv regression\|Index \|0.982 \|0.864 \|
		\| \|MT-NN regression \|Random \|0.908 \|0.830 \|
		\| \|graphconv regression\|Random \|0.987 \|0.868 \|
		\| \|MT-NN regression \|Scaffold \|0.891 \|0.217 \|
		\| \|graphconv regression\|Scaffold \|0.985 \|0.666 \|
		\|nci \|MT-NN regression \|Index \|0.171 \|0.062 \|
		\| \|graphconv regression\|Index \|0.123 \|0.048 \|
		\| \|MT-NN regression \|Random \|0.168 \|0.085 \|
		@@ -286,6 +292,7 @@ Number of tasks and examples in the datasets
		\|sider \|27 \|1427 \|
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|sampl \|1 \|643 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|19127 \|
		\|pdbbind(core) \|1 \|195 \|
		@@ -322,6 +329,8 @@ Time needed for benchmark test (~20h in total)
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|sampl \|MT-NN regression \|10 \|30 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|400 \|1200 \|
		\| \|graphconv regression\|400 \|2500 \|
		\|pdbbind(core) \|MT-NN regression \|0(featurized) \|30 \|

deepchem/splits/splitters.py

+8 −10

Original line number	Diff line number	Diff line
		@@ -351,20 +351,18 @@ class IndiceSplitter(Splitter):
		"""
		num_datapoints = len(dataset)
		indices = np.arange(num_datapoints).tolist()
		train_indices = []
		if self.valid_indices is None:
		self.valid_indices = []
		else:
		for indice in indices:
		if indice in self.valid_indices:
		indices.remove(indice)
		if self.test_indices is None:
		self.test_indices = []
		else:
		valid_test = self.valid_indices
		valid_test.extend(self.test_indices)
		for indice in indices:
		if indice in self.valid_indices:
		indices.remove(indice)
		if not indice in valid_test:
		train_indices.append(indice)

		return (indices, self.valid_indices, self.test_indices)
		return (train_indices, self.valid_indices, self.test_indices)


		class ScaffoldSplitter(Splitter):

examples/benchmark.py

+6 −5

Original line number	Diff line number	Diff line
		@@ -52,6 +52,7 @@ from nci.nci_datasets import load_nci
		from pdbbind.pdbbind_datasets import load_pdbbind_grid
		from chembl.chembl_datasets import load_chembl
		from gdb7.gdb7_datasets import load_gdb7
		from sampl.sampl_datasets import load_sampl

		def benchmark_loading_datasets(hyper_parameters,
		dataset='tox21', model='tf', split=None,
		@@ -77,7 +78,8 @@ def benchmark_loading_datasets(hyper_parameters,

		if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast']:
		mode = 'classification'
		elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'gdb7']:
		elif dataset in ['kaggle', 'delaney', 'nci', 'pdbbind', 'chembl',
		'gdb7', 'sampl']:
		mode = 'regression'
		else:
		raise ValueError('Dataset not supported')
		@@ -130,7 +132,8 @@ def benchmark_loading_datasets(hyper_parameters,
		'sider': load_sider, 'toxcast': load_toxcast,
		'kaggle': load_kaggle, 'delaney': load_delaney,
		'pdbbind': load_pdbbind_grid,
		'chembl': load_chembl, 'gdb7': load_gdb7}
		'chembl': load_chembl, 'gdb7': load_gdb7,
		'sampl': load_sampl}

		print('-------------------------------------')
		print('Benchmark %s on dataset: %s' % (model, dataset))
		@@ -163,8 +166,6 @@ def benchmark_loading_datasets(hyper_parameters,
		model=model)
		elif mode == 'regression':
		metric = 'r2'
		if dataset in ['gdb7']:
		metric = 'mae'
		train_score, valid_score = benchmark_regression(
		train_dataset, valid_dataset, tasks,
		transformers, hp, n_features, metric=metric,
		@@ -594,7 +595,7 @@ if __name__ == '__main__':
		'dropouts': [0.25, 0.25],
		'penalty': 0.0005, 'penalty_type': 'l2',
		'batch_size': 128, 'nb_epoch': 50,
		'learning_rate': 0.00008}]
		'learning_rate': 0.0008}]

		hps['graphconvreg'] = [{'batch_size': 128, 'nb_epoch': 20,
		'learning_rate': 0.0005, 'n_filters': 128,

examples/gdb7/gdb7_datasets.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -17,7 +17,7 @@ def load_gdb7_from_mat(split=0):
		if not os.path.exists('qm7.mat'): os.system('wget http://www.quantum-machine.org/data/qm7.mat')
		dataset = scipy.io.loadmat('qm7.mat')

		P = dataset['P'][range(0,split)+range(split+1,5)].flatten()
		P = dataset['P'][list(range(0,split))+list(range(split+1,5))].flatten()
		X = dataset['X'][P]
		y = dataset['T'][0,P]
		w = np.ones_like(y)
		@@ -67,7 +67,7 @@ def load_gdb7(featurizer=None, split='random'):
		with open(split_file, 'r') as f:
		reader = csv.reader(f)
		for row in reader:
		row_int = (np.asarray(list(map(int, row)))-1).tolist()
		row_int = (np.asarray(list(map(int, row)))).tolist()
		split_indices.append(row_int)

examples/sampl/SAMPL.csv

0 → 100644

+644 −0

File added.

Preview size limit exceeded, changes collapsed.

Original line number	Diff line number	Diff line
		@@ -245,12 +245,18 @@ Scaffold splitting

		\|Dataset \|Model \|Splitting \|Train score/R2\|Valid score/R2\|
		\|----------------\|--------------------\|------------\|--------------\|--------------\|
		\|delaney \|MT-NN regression \|Index \|0.773 \|0.574 \|
		\| \|graphconv regression\|Index \|0.991 \|0.825 \|
		\| \|MT-NN regression \|Random \|0.769 \|0.591 \|
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|delaney \|MT-NN regression \|Index \|0.868 \|0.578 \|
		\| \|graphconv regression\|Index \|0.967 \|0.790 \|
		\| \|MT-NN regression \|Random \|0.865 \|0.574 \|
		\| \|graphconv regression\|Random \|0.964 \|0.782 \|
		\| \|MT-NN regression \|Scaffold \|0.866 \|0.342 \|
		\| \|graphconv regression\|Scaffold \|0.967 \|0.606 \|
		\|sampl \|MT-NN regression \|Index \|0.917 \|0.764 \|
		\| \|graphconv regression\|Index \|0.982 \|0.864 \|
		\| \|MT-NN regression \|Random \|0.908 \|0.830 \|
		\| \|graphconv regression\|Random \|0.987 \|0.868 \|
		\| \|MT-NN regression \|Scaffold \|0.891 \|0.217 \|
		\| \|graphconv regression\|Scaffold \|0.985 \|0.666 \|
		\|nci \|MT-NN regression \|Index \|0.171 \|0.062 \|
		\| \|graphconv regression\|Index \|0.123 \|0.048 \|
		\| \|MT-NN regression \|Random \|0.168 \|0.085 \|
		@@ -286,6 +292,7 @@ Number of tasks and examples in the datasets
		\|sider \|27 \|1427 \|
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|sampl \|1 \|643 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|19127 \|
		\|pdbbind(core) \|1 \|195 \|
		@@ -322,6 +329,8 @@ Time needed for benchmark test (~20h in total)
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|sampl \|MT-NN regression \|10 \|30 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|400 \|1200 \|
		\| \|graphconv regression\|400 \|2500 \|
		\|pdbbind(core) \|MT-NN regression \|0(featurized) \|30 \|

Admin message