Merge branch 'master' of https://github.com/deepchem/deepchem into gdb7_update (1d0f96e0) · Commits · 钟慕尧 / deepchem

README.md

+15 −1

Original line number	Diff line number	Diff line
		@@ -260,8 +260,18 @@ Scaffold splitting
		\|pdbbind(core) \|MT-NN regression \|Random \|0.973 \|0.494 \|
		\|pdbbind(refined)\|MT-NN regression \|Random \|0.987 \|0.503 \|
		\|pdbbind(full) \|MT-NN regression \|Random \|0.983 \|0.528 \|
		\|chembl \|MT-NN regression \|Index \|0.443 \|0.427 \|
		\| \|MT-NN regression \|Random \|0.464 \|0.434 \|
		\| \|MT-NN regression \|Scaffold \|0.484 \|0.361 \|
		\|gdb7 \|MT-NN regression \|Index \|0.961 \|0.011 \|
		\| \|MT-NN regression \|Random \|0.742 \|0.732 \|
		\|kaggle \|MT-NN regression \|User-defined\|0.748 \|0.452 \|

		\|Dataset \|Model \|Splitting \|Train score/MAE(kcal/mol)\|Valid score/MAE(kcal/mol)\|
		\|----------------\|--------------------\|------------\|-------------------------\|-------------------------\|
		\|gdb7 \|MT-NN regression \|Index \|44.5 \|185.6 \|
		\| \|MT-NN regression \|Random \|86.1 \|92.2 \|

		* General features

		Number of tasks and examples in the datasets
		@@ -279,6 +289,8 @@ Number of tasks and examples in the datasets
		\|pdbbind(core) \|1 \|195 \|
		\|pdbbind(refined)\|1 \|3706 \|
		\|pdbbind(full) \|1 \|11908 \|
		\|chembl(5thresh) \|691 \|23871 \|
		\|gdb7 \|1 \|7165 \|



		@@ -313,6 +325,8 @@ Time needed for benchmark test(~20h in total)
		\|pdbbind(core) \|MT-NN regression \|0(featurized) \|30 \|
		\|pdbbind(refined)\|MT-NN regression \|0(featurized) \|40 \|
		\|pdbbind(full) \|MT-NN regression \|0(featurized) \|60 \|
		\|chembl \|MT-NN regression \|200 \|9000 \|
		\|gdb7 \|MT-NN regression \|10 \|110 \|
		\|kaggle \|MT-NN regression \|2200 \|3200 \|

+1 −0

Original line number	Diff line number	Diff line
		@@ -10,5 +10,6 @@ from deepchem.splits.splitters import *
		from deepchem.splits.splitters import ScaffoldSplitter
		from deepchem.splits.splitters import SpecifiedSplitter
		from deepchem.splits.splitters import IndexSplitter
		from deepchem.splits.splitters import IndiceSplitter
		from deepchem.splits.task_splitter import merge_fold_datasets
		from deepchem.splits.task_splitter import TaskSplitter

+39 −0

Original line number	Diff line number	Diff line
		@@ -327,6 +327,45 @@ class IndexSplitter(Splitter):
		return (indices[:train_cutoff], indices[train_cutoff:valid_cutoff],
		indices[valid_cutoff:])

		class IndiceSplitter(Splitter):
		    """
		    Class for splits based on user-specified validation and test indices.

		    Every sample whose index appears in neither `valid_indices` nor
		    `test_indices` is assigned to the training set.
		    """

		    def __init__(self, verbose=False, valid_indices=None, test_indices=None):
		        """
		        Parameters
		        -----------
		        verbose: bool
		          stored on the instance as `self.verbose`
		        valid_indices: list of int
		          indices of samples in the valid set (None means no valid set)
		        test_indices: list of int
		          indices of samples in the test set (None means no test set)
		        """
		        self.verbose = verbose
		        self.valid_indices = valid_indices
		        self.test_indices = test_indices

		    def split(self, dataset, seed=None, frac_train=.8, frac_valid=.1,
		              frac_test=.1, log_every_n=None):
		        """
		        Splits internal compounds into train/validation/test by designated
		        indices.

		        Only `len(dataset)` is read from `dataset`. The `seed`, `frac_train`,
		        `frac_valid`, `frac_test` and `log_every_n` arguments are accepted for
		        signature compatibility and are not used by this splitter.

		        Returns
		        -------
		        tuple
		          `(train_indices, valid_indices, test_indices)`, where
		          `train_indices` is an ascending list of every index in
		          `range(len(dataset))` that is in neither held-out set, and the
		          other two are the values given at construction (or `[]` when the
		          corresponding value was None).
		        """
		        # Work on locals so that calling split() never rewrites the
		        # configuration stored on the instance.
		        valid = [] if self.valid_indices is None else self.valid_indices
		        test = [] if self.test_indices is None else self.test_indices

		        # Build the exclusion set once: O(1) membership tests, and BOTH the
		        # validation and the test indices are excluded from training.
		        held_out = set(valid)
		        held_out.update(test)

		        # Construct a new list rather than removing from a list while
		        # iterating over it, which silently skips elements.
		        train = [i for i in range(len(dataset)) if i not in held_out]

		        return (train, valid, test)


		class ScaffoldSplitter(Splitter):
		"""

+1 −0

Original line number	Diff line number	Diff line
		@@ -104,6 +104,7 @@ class NormalizationTransformer(Transformer):
		y_means, y_stds = dataset.get_statistics(X_stats=False, y_stats=True)
		self.y_means = y_means
		# Control for pathological case with no variance.
		y_stds = np.array(y_stds)
		y_stds[y_stds == 0] = 1.
		self.y_stds = y_stds
		self.transform_gradients = transform_gradients

+2 −0

Original line number	Diff line number	Diff line
		@@ -108,6 +108,8 @@ def load_from_disk(filename):
		except KeyError:
		# Try older joblib version for legacy files.
		return old_joblib.load(filename)
		except ValueError:
		return old_joblib.load(filename)
		elif os.path.splitext(name)[1] == ".csv":
		# First line of user-specified CSV must be header.
		df = pd.read_csv(filename, header=0)

Original line number	Diff line number	Diff line
		@@ -260,8 +260,18 @@ Scaffold splitting
		\|pdbbind(core) \|MT-NN regression \|Random \|0.973 \|0.494 \|
		\|pdbbind(refined)\|MT-NN regression \|Random \|0.987 \|0.503 \|
		\|pdbbind(full) \|MT-NN regression \|Random \|0.983 \|0.528 \|
		\|chembl \|MT-NN regression \|Index \|0.443 \|0.427 \|
		\| \|MT-NN regression \|Random \|0.464 \|0.434 \|
		\| \|MT-NN regression \|Scaffold \|0.484 \|0.361 \|
		\|gdb7 \|MT-NN regression \|Index \|0.961 \|0.011 \|
		\| \|MT-NN regression \|Random \|0.742 \|0.732 \|
		\|kaggle \|MT-NN regression \|User-defined\|0.748 \|0.452 \|

		\|Dataset \|Model \|Splitting \|Train score/MAE(kcal/mol)\|Valid score/MAE(kcal/mol)\|
		\|----------------\|--------------------\|------------\|-------------------------\|-------------------------\|
		\|gdb7 \|MT-NN regression \|Index \|44.5 \|185.6 \|
		\| \|MT-NN regression \|Random \|86.1 \|92.2 \|

		* General features

		Number of tasks and examples in the datasets
		@@ -279,6 +289,8 @@ Number of tasks and examples in the datasets
		\|pdbbind(core) \|1 \|195 \|
		\|pdbbind(refined)\|1 \|3706 \|
		\|pdbbind(full) \|1 \|11908 \|
		\|chembl(5thresh) \|691 \|23871 \|
		\|gdb7 \|1 \|7165 \|



		@@ -313,6 +325,8 @@ Time needed for benchmark test(~20h in total)
		\|pdbbind(core) \|MT-NN regression \|0(featurized) \|30 \|
		\|pdbbind(refined)\|MT-NN regression \|0(featurized) \|40 \|
		\|pdbbind(full) \|MT-NN regression \|0(featurized) \|60 \|
		\|chembl \|MT-NN regression \|200 \|9000 \|
		\|gdb7 \|MT-NN regression \|10 \|110 \|
		\|kaggle \|MT-NN regression \|2200 \|3200 \|