Merge pull request #336 from miaecle/temp (c0a17207) · Commits · 钟慕尧 / deepchem

README.md

+29 −23

Original line number	Diff line number	Diff line
		@@ -195,26 +195,26 @@ Random splitting

		\|Dataset \|Model \|Train score/ROC-AUC\|Valid score/ROC-AUC\|
		\|-----------\|--------------------\|-------------------\|-------------------\|
		\|tox21 \|logistic regression \|0.903 \|0.741 \|
		\| \|Multitask network \|0.846 \|0.812 \|
		\| \|robust MT-NN \|0.844 \|0.793 \|
		\| \|graph convolution \|0.872 \|0.816 \|
		\|muv \|logistic regression \|0.961 \|0.696 \|
		\| \|Multitask network \|0.895 \|0.740 \|
		\| \|robust MT-NN \|0.914 \|0.667 \|
		\| \|graph convolution \|0.846 \|0.776 \|
		\|pcba \|logistic regression \|0.807 \|0.772 \|
		\| \|Multitask network \|0.811 \|0.787 \|
		\| \|robust MT-NN \|0.809 \|0.778 \|
		\| \|graph convolution \|0.875 \|0.844 \|
		\|sider \|logistic regression \|0.932 \|0.628 \|
		\| \|Multitask network \|0.779 \|0.665 \|
		\| \|robust MT-NN \|0.761 \|0.621 \|
		\| \|graph convolution \|0.706 \|0.638 \|
		\|toxcast \|logistic regression \|0.737 \|0.543 \|
		\| \|Multitask network \|0.831 \|0.684 \|
		\| \|robust MT-NN \|0.814 \|0.692 \|
		\| \|graph convolution \|0.820 \|0.692 \|
		\|tox21 \|logistic regression \|0.903 \|0.735 \|
		\| \|Multitask network \|0.856 \|0.783 \|
		\| \|robust MT-NN \|0.855 \|0.773 \|
		\| \|graph convolution \|0.865 \|0.827 \|
		\|muv \|logistic regression \|0.957 \|0.719 \|
		\| \|Multitask network \|0.902 \|0.734 \|
		\| \|robust MT-NN \|0.933 \|0.732 \|
		\| \|graph convolution \|0.860 \|0.730 \|
		\|pcba \|logistic regression \|0.808 \|0.776 \|
		\| \|Multitask network \|0.811 \|0.778 \|
		\| \|robust MT-NN \|0.811 \|0.771 \|
		\| \|graph convolution \|0.872 \|0.844 \|
		\|sider \|logistic regression \|0.929 \|0.656 \|
		\| \|Multitask network \|0.777 \|0.655 \|
		\| \|robust MT-NN \|0.804 \|0.630 \|
		\| \|graph convolution \|0.705 \|0.618 \|
		\|toxcast \|logistic regression \|0.725 \|0.586 \|
		\| \|Multitask network \|0.836 \|0.684 \|
		\| \|robust MT-NN \|0.822 \|0.681 \|
		\| \|graph convolution \|0.820 \|0.717 \|

		Scaffold splitting

		@@ -246,11 +246,14 @@ Scaffold splitting
		\|Dataset \|Model \|Splitting \|Train score/R2\|Valid score/R2\|
		\|-----------\|--------------------\|------------\|--------------\|--------------\|
		\|delaney \|MT-NN regression \|Index \|0.773 \|0.574 \|
		\| \|graphconv regression\|Index \|0.964 \|0.829 \|
		\| \|graphconv regression\|Index \|0.991 \|0.825 \|
		\| \|MT-NN regression \|Random \|0.769 \|0.591 \|
		\| \|graphconv regression\|Random \|0.959 \|0.821 \|
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.976 \|0.581 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|nci \|MT-NN regression \|Index \|0.890 \|0.890 \|
		\| \|MT-NN regression \|Random \|0.891 \|0.888 \|
		\| \|MT-NN regression \|Scaffold \|0.912 \|0.020 \|
		\|kaggle \|MT-NN regression \|User-defined\|0.748 \|0.452 \|

		* General features
		@@ -266,6 +269,7 @@ Number of tasks and examples in the datasets
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|1057371 \|

		Time needed for benchmark test(~20h in total)

		@@ -292,6 +296,8 @@ Time needed for benchmark test(~20h in total)
		\| \|robust MT-NN \|80 \|4000 \|
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|2000 \|30000 \|
		\|kaggle \|MT-NN regression \|2200 \|3200 \|

deepchem/feat/graph_features.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -110,7 +110,7 @@ def atom_features(atom, bool_id_feat=False):
		'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H', # H?
		'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr',
		'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']) +
		one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6]) +
		one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
		one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4]) +
		one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6]) +
		[atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] +

deepchem/feat/mol_graphs.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -47,7 +47,7 @@ class ConvMol(object):
		Resorts order of atoms internally to be in order of increasing degree. Note
		that only heavy atoms (hydrogens excluded) are considered here.
		"""
		def __init__(self, atom_features, adj_list, max_deg=6, min_deg=0):
		def __init__(self, atom_features, adj_list, max_deg=10, min_deg=0):
		"""
		Parameters
		----------
		@@ -223,7 +223,7 @@ class ConvMol(object):

		# TODO(rbharath): Can this be removed?
		@staticmethod
		def get_null_mol(n_feat, max_deg=6, min_deg=0):
		def get_null_mol(n_feat, max_deg=10, min_deg=0):
		"""Constructs a null molecules

		Get one molecule with one atom of each degree, with all the atoms
		@@ -243,7 +243,7 @@ class ConvMol(object):
		return ConvMol(atom_features, canon_adj_list)

		@staticmethod
		def agglomerate_mols(mols, max_deg=6, min_deg=0):
		def agglomerate_mols(mols, max_deg=10, min_deg=0):
		"""Concatenates list of ConvMol's into one mol object that can be used to feed
		into tensorflow placeholders. The indexing of the molecules is preserved during the
		combination, but the indexing of the atoms is greatly changed.

deepchem/feat/tests/test_mol_graphs.py

+9 −3

Original line number	Diff line number	Diff line
		@@ -49,7 +49,12 @@ class TestMolGraphs(unittest.TestCase):
		# 0 atoms of degree 4
		# 0 atoms of degree 5
		# 0 atoms of degree 6
		np.array([[0, 0], [0, 0], [0, 4], [0, 0], [0, 0], [0, 0], [0,0]]))
		# 0 atoms of degree 7
		# 0 atoms of degree 8
		# 0 atoms of degree 9
		# 0 atoms of degree 10
		np.array([[0, 0], [0, 0], [0, 4], [0, 0], [0, 0], [0, 0], [0, 0],
		[0, 0], [0, 0], [0, 0], [0, 0]]))

		def test_get_atom_features(self):
		"""Test that the atom features are computed properly."""
		@@ -168,10 +173,11 @@ class TestMolGraphs(unittest.TestCase):

		# Check that atoms are only connected to themselves.
		assert np.array_equal(
		deg_adj_lists[6], [[6, 6, 6, 6, 6, 6]])
		deg_adj_lists[10], [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]])
		assert np.array_equal(
		deg_adj_lists[1], [[1]])
		# Check that there's one atom of each degree.
		assert np.array_equal(
		null_mol.get_deg_slice(),
		[[0, 1], [1, 1], [2, 1], [3, 1], [4, 1], [5, 1], [6, 1]])
		[[0, 1], [1, 1], [2, 1], [3, 1], [4, 1], [5, 1], [6, 1],
		[7, 1], [8, 1], [9, 1], [10, 1]])

deepchem/models/tests/test_overfit.py

+5 −5

Original line number	Diff line number	Diff line
		@@ -486,7 +486,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
		classification_metric = dc.metrics.Metric(
		dc.metrics.accuracy_score)

		n_feat = 71
		n_feat = 75
		batch_size = 10
		graph_model = dc.nn.SequentialGraph(n_feat)
		graph_model.add(dc.nn.GraphConv(64, activation='relu'))
		@@ -537,7 +537,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
		dc.metrics.mean_squared_error,
		task_averager=np.mean)

		n_feat = 71
		n_feat = 75
		batch_size = 10
		graph_model = dc.nn.SequentialGraph(n_feat)
		graph_model.add(dc.nn.GraphConv(64, activation='relu'))
		@@ -572,7 +572,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
		K.set_session(sess)
		with g.as_default():
		n_tasks = 1
		n_feat = 71
		n_feat = 75
		max_depth = 4
		n_pos = 6
		n_neg = 4
		@@ -637,7 +637,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
		K.set_session(sess)
		with g.as_default():
		n_tasks = 1
		n_feat = 71
		n_feat = 75
		max_depth = 4
		n_pos = 6
		n_neg = 4
		@@ -704,7 +704,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
		K.set_session(sess)
		with g.as_default():
		n_tasks = 1
		n_feat = 71
		n_feat = 75
		max_depth = 4
		n_pos = 6
		n_neg = 4

Original line number	Diff line number	Diff line
		@@ -195,26 +195,26 @@ Random splitting

		\|Dataset \|Model \|Train score/ROC-AUC\|Valid score/ROC-AUC\|
		\|-----------\|--------------------\|-------------------\|-------------------\|
		\|tox21 \|logistic regression \|0.903 \|0.741 \|
		\| \|Multitask network \|0.846 \|0.812 \|
		\| \|robust MT-NN \|0.844 \|0.793 \|
		\| \|graph convolution \|0.872 \|0.816 \|
		\|muv \|logistic regression \|0.961 \|0.696 \|
		\| \|Multitask network \|0.895 \|0.740 \|
		\| \|robust MT-NN \|0.914 \|0.667 \|
		\| \|graph convolution \|0.846 \|0.776 \|
		\|pcba \|logistic regression \|0.807 \|0.772 \|
		\| \|Multitask network \|0.811 \|0.787 \|
		\| \|robust MT-NN \|0.809 \|0.778 \|
		\| \|graph convolution \|0.875 \|0.844 \|
		\|sider \|logistic regression \|0.932 \|0.628 \|
		\| \|Multitask network \|0.779 \|0.665 \|
		\| \|robust MT-NN \|0.761 \|0.621 \|
		\| \|graph convolution \|0.706 \|0.638 \|
		\|toxcast \|logistic regression \|0.737 \|0.543 \|
		\| \|Multitask network \|0.831 \|0.684 \|
		\| \|robust MT-NN \|0.814 \|0.692 \|
		\| \|graph convolution \|0.820 \|0.692 \|
		\|tox21 \|logistic regression \|0.903 \|0.735 \|
		\| \|Multitask network \|0.856 \|0.783 \|
		\| \|robust MT-NN \|0.855 \|0.773 \|
		\| \|graph convolution \|0.865 \|0.827 \|
		\|muv \|logistic regression \|0.957 \|0.719 \|
		\| \|Multitask network \|0.902 \|0.734 \|
		\| \|robust MT-NN \|0.933 \|0.732 \|
		\| \|graph convolution \|0.860 \|0.730 \|
		\|pcba \|logistic regression \|0.808 \|0.776 \|
		\| \|Multitask network \|0.811 \|0.778 \|
		\| \|robust MT-NN \|0.811 \|0.771 \|
		\| \|graph convolution \|0.872 \|0.844 \|
		\|sider \|logistic regression \|0.929 \|0.656 \|
		\| \|Multitask network \|0.777 \|0.655 \|
		\| \|robust MT-NN \|0.804 \|0.630 \|
		\| \|graph convolution \|0.705 \|0.618 \|
		\|toxcast \|logistic regression \|0.725 \|0.586 \|
		\| \|Multitask network \|0.836 \|0.684 \|
		\| \|robust MT-NN \|0.822 \|0.681 \|
		\| \|graph convolution \|0.820 \|0.717 \|

		Scaffold splitting

		@@ -246,11 +246,14 @@ Scaffold splitting
		\|Dataset \|Model \|Splitting \|Train score/R2\|Valid score/R2\|
		\|-----------\|--------------------\|------------\|--------------\|--------------\|
		\|delaney \|MT-NN regression \|Index \|0.773 \|0.574 \|
		\| \|graphconv regression\|Index \|0.964 \|0.829 \|
		\| \|graphconv regression\|Index \|0.991 \|0.825 \|
		\| \|MT-NN regression \|Random \|0.769 \|0.591 \|
		\| \|graphconv regression\|Random \|0.959 \|0.821 \|
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.976 \|0.581 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|nci \|MT-NN regression \|Index \|0.890 \|0.890 \|
		\| \|MT-NN regression \|Random \|0.891 \|0.888 \|
		\| \|MT-NN regression \|Scaffold \|0.912 \|0.020 \|
		\|kaggle \|MT-NN regression \|User-defined\|0.748 \|0.452 \|

		* General features
		@@ -266,6 +269,7 @@ Number of tasks and examples in the datasets
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|1057371 \|

		Time needed for benchmark test(~20h in total)

		@@ -292,6 +296,8 @@ Time needed for benchmark test(~20h in total)
		\| \|robust MT-NN \|80 \|4000 \|
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|2000 \|30000 \|
		\|kaggle \|MT-NN regression \|2200 \|3200 \|

Admin message