Commit de190ef8 authored by miaecle's avatar miaecle
Browse files

temp save

parent 9aaff43b
Loading
Loading
Loading
Loading
+12 −10
Original line number Diff line number Diff line
@@ -220,7 +220,7 @@ Index splitting
|           |MT-NN classification|0.934              |0.830              |
|           |Robust MT-NN        |0.949              |0.827              |
|           |Graph convolution   |0.946              |0.860              |
|           |DAG                 |0.953              |0.775              |
|           |Weave               |0.907              |0.879              |
|hiv        |Logistic regression |0.864              |0.739              |
|           |Random forest       |0.999              |0.720              |
|           |XGBoost             |0.917              |0.745              |
@@ -252,7 +252,7 @@ Index splitting
|           |MT-NN classification|0.856              |0.763              |
|           |Robust MT-NN        |0.857              |0.767              |
|           |Graph convolution   |0.872              |0.798              |
|           |DAG                 |0.831              |0.750              |
|           |Weave               |0.810              |0.778              |
|toxcast    |Logistic regression |0.721              |0.575              |
|           |XGBoost             |0.738              |0.621              |
|           |MT-NN classification|0.830              |0.678              |
@@ -282,7 +282,7 @@ Random splitting
|           |MT-NN classification|0.951              |0.834              |
|           |Robust MT-NN        |0.959              |0.830              |
|           |Graph convolution   |0.975              |0.876              |
|           |DAG                 |0.917              |0.744              |
|           |Weave               |0.890              |0.738              |
|hiv        |Logistic regression |0.860              |0.806              |
|           |Random forest       |0.999              |0.850              |
|           |XGBoost             |0.933              |0.841              |
@@ -313,7 +313,7 @@ Random splitting
|           |MT-NN classification|0.844              |0.795              |
|           |Robust MT-NN        |0.855              |0.773              |
|           |Graph convolution   |0.865              |0.827              |
|           |DAG                 |0.872              |0.758              |
|           |Weave               |0.796              |0.781              |
|toxcast    |Logistic regression |0.725              |0.586              |
|           |XGBoost             |0.738              |0.633              |
|           |MT-NN classification|0.836              |0.684              |
@@ -343,7 +343,7 @@ Scaffold splitting
|           |MT-NN classification|0.937              |0.828              |
|           |Robust MT-NN        |0.956              |0.821              |
|           |Graph convolution   |0.965              |0.900              |
|           |DAG                 |0.925              |0.703              |
|           |Weave               |0.888              |0.873              |
|hiv        |Logistic regression |0.858              |0.798              |
|           |Random forest       |0.946              |0.562              |
|           |XGBoost             |0.927              |0.830              |
@@ -374,7 +374,7 @@ Scaffold splitting
|           |MT-NN classification|0.863              |0.703              |
|           |Robust MT-NN        |0.861              |0.710              |
|           |Graph convolution   |0.885              |0.732              |
|           |DAG                 |0.861              |0.670              |
|           |Weave               |0.812              |0.727              |
|toxcast    |Logistic regression |0.716              |0.492              |
|           |XGBoost             |0.741              |0.587              |
|           |MT-NN classification|0.828              |0.617              |
@@ -411,17 +411,17 @@ Scaffold splitting
|                |XGBoost             |Index       |0.898         |0.664         |
|                |NN regression       |Index       |0.868         |0.578         |
|                |Graphconv regression|Index       |0.967         |0.790         |
|                |DAG regression      |Index       |0.921         |0.827         |
|                |Weave regression    |Index       |0.967         |0.860         |
|                |Random forest       |Random      |0.951         |0.684         |
|                |XGBoost             |Random      |0.927         |0.727         |
|                |NN regression       |Random      |0.865         |0.574         |
|                |Graphconv regression|Random      |0.964         |0.782         |
|                |DAG regression      |Random      |0.898         |0.857         |
|                |Weave regression    |Random      |0.965         |0.925         |
|                |Random forest       |Scaffold    |0.953         |0.284         |
|                |XGBoost             |Scaffold    |0.890         |0.316         |
|                |NN regression       |Scaffold    |0.866         |0.342         |
|                |Graphconv regression|Scaffold    |0.967         |0.606         |
|                |DAG regression      |Scaffold    |0.931         |0.647         |
|                |Weave regression    |Scaffold    |0.968         |0.752         |
|hopv            |Random forest       |Index       |0.943         |0.338         |
|                |MT-NN regression    |Index       |0.725         |0.293         |
|                |Graphconv regression|Index       |0.307         |0.284         |
@@ -496,15 +496,17 @@ Scaffold splitting
|                |XGBoost             |Index       |0.884         |0.784         |
|                |NN regression       |Index       |0.917         |0.764         |
|                |Graphconv regression|Index       |0.982         |0.903         |
|                |DAG regression      |Index       |0.891         |0.777         |
|                |Weave regression    |Index       |0.986         |0.924         |
|                |Random forest       |Random      |0.967         |0.752         |
|                |XGBoost             |Random      |0.906         |0.745         |
|                |NN regression       |Random      |0.908         |0.711         |
|                |Graphconv regression|Random      |0.987         |0.868         |
|                |Weave regression    |Random      |0.997         |0.888         |
|                |Random forest       |Scaffold    |0.966         |0.477         |
|                |XGBoost             |Scaffold    |0.918         |0.439         |
|                |NN regression       |Scaffold    |0.891         |0.217         |
|                |Graphconv regression|Scaffold    |0.985         |0.666         |
|                |Weave regression    |Scaffold    |0.991         |0.833         |

|Dataset         |Model            |Splitting   |Train score/MAE(kcal/mol)|Valid score/MAE(kcal/mol)|
|----------------|-----------------|------------|-------------------------|-------------------------|
+5 −7
Original line number Diff line number Diff line
@@ -166,14 +166,15 @@ class SequentialWeaveGraph(SequentialGraph):
  """SequentialGraph for Weave models
  """

  def __init__(self, max_atoms=50, n_atom_feat=75, n_pair_feat=14):
  def __init__(self, batch_size, n_atom_feat=75, n_pair_feat=14, max_atoms=100):
    self.graph = tf.Graph()
    self.batch_size = batch_size
    self.max_atoms = max_atoms
    self.n_atom_feat = n_atom_feat
    self.n_pair_feat = n_pair_feat
    with self.graph.as_default():
      self.graph_topology = WeaveGraphTopology(self.max_atoms, self.n_atom_feat,
                                               self.n_pair_feat)
      self.graph_topology = WeaveGraphTopology(self.batch_size, self.n_atom_feat,
                                               self.n_pair_feat, self.max_atoms)
      self.output = self.graph_topology.get_atom_features_placeholder()
      self.output_P = self.graph_topology.get_pair_features_placeholder()
    self.layers = []
@@ -185,12 +186,9 @@ class SequentialWeaveGraph(SequentialGraph):
        self.output, self.output_P = layer([
            self.output, self.output_P
        ] + self.graph_topology.get_topology_placeholders())
      elif type(layer).__name__ in ['WeaveConcat']:
        self.output = layer(
            [self.output, self.graph_topology.atom_mask_placeholder])
      elif type(layer).__name__ in ['WeaveGather']:
        self.output = layer(
            [self.output, self.graph_topology.membership_placeholder])
            [self.output, self.graph_topology.atom_split_placeholder])
      else:
        self.output = layer(self.output)
      self.layers.append(layer)
+53 −39
Original line number Diff line number Diff line
@@ -397,8 +397,8 @@ class DAGGraphTopology(GraphTopology):
class WeaveGraphTopology(GraphTopology):
  """Manages placeholders associated with batch of graphs and their topology"""

  def __init__(self, max_atoms, n_atom_feat, n_pair_feat,
               name='Weave_topology'):
  def __init__(self, batch_size, n_atom_feat, n_pair_feat, 
               max_atoms=100, name='Weave_topology'):
    """
    Parameters
    ----------
@@ -412,30 +412,36 @@ class WeaveGraphTopology(GraphTopology):

    #self.n_atoms = n_atoms
    self.name = name
    self.max_atoms = max_atoms
    self.batch_size = batch_size
    self.n_atom_feat = n_atom_feat
    self.n_pair_feat = n_pair_feat
    self.max_atoms = max_atoms * batch_size

    self.atom_features_placeholder = tf.placeholder(
        dtype='float32',
        shape=(None, self.max_atoms, self.n_atom_feat),
        shape=(None, self.n_atom_feat),
        name=self.name + '_atom_features')
    self.atom_mask_placeholder = tf.placeholder(
        dtype='float32',
        shape=(None, self.max_atoms),
        name=self.name + '_atom_mask')
    self.pair_features_placeholder = tf.placeholder(
        dtype='float32',
        shape=(None, self.max_atoms, self.max_atoms, self.n_pair_feat),
        shape=(None, self.n_pair_feat),
        name=self.name + '_pair_features')
    self.pair_mask_placeholder = tf.placeholder(
        dtype='float32',
        shape=(None, self.max_atoms, self.max_atoms),
        name=self.name + '_pair_mask')
    self.membership_placeholder = tf.placeholder(
        dtype='int32', shape=(None,), name=self.name + '_membership')
    self.pair_split_placeholder = tf.placeholder(
        dtype='int32', shape=(self.max_atoms,), 
        name=self.name + '_pair_split')
    self.pair_membership_placeholder = tf.placeholder(
        dtype='bool', shape=(self.max_atoms,), 
        name=self.name + '_pair_membership')
    self.atom_split_placeholder = tf.placeholder(
        dtype='int32', shape=(self.batch_size,), 
        name=self.name + '_atom_split')
    self.atom_to_pair_placeholder = tf.placeholder(
        dtype='int32', shape=(None,2), 
        name=self.name + '_atom_to_pair')
    
    
    # Define the list of tensors to be used as topology
    self.topology = [self.atom_mask_placeholder, self.pair_mask_placeholder]
    self.topology = [self.pair_split_placeholder, self.pair_membership_placeholder,
                     self.atom_split_placeholder, self.atom_to_pair_placeholder]
    self.inputs = [self.atom_features_placeholder]
    self.inputs += self.topology

@@ -461,34 +467,42 @@ class WeaveGraphTopology(GraphTopology):
    # Extract atom numbers
    atom_feat = []
    pair_feat = []
    atom_mask = []
    pair_mask = []
    membership = []
    atom_split = []
    atom_to_pair = []
    pair_split = []
    max_atoms = self.max_atoms
    start = 0
    for im, mol in enumerate(batch):
      n_atoms = mol.get_num_atoms()
      atom_feat.append(
          np.pad(mol.get_atom_features(), ((0, max_atoms - n_atoms), (0, 0)),
                 'constant'))
      atom_mask.append(
          np.array([1] * n_atoms + [0] * (max_atoms - n_atoms), dtype=float))
      pair_feat.append(
          np.pad(mol.get_pair_features(), ((0, max_atoms - n_atoms), (
              0, max_atoms - n_atoms), (0, 0)), 'constant'))
      pair_mask.append(np.array([[1]*n_atoms + [0]*(max_atoms-n_atoms)]*n_atoms + \
                       [[0]*max_atoms]*(max_atoms-n_atoms), dtype=float))
      membership.extend([im] * n_atoms)
    atom_feat = np.stack(atom_feat)
    pair_feat = np.stack(pair_feat)
    atom_mask = np.stack(atom_mask)
    pair_mask = np.stack(pair_mask)
    membership = np.array(membership)
      # number of atoms in each molecule
      atom_split.append(n_atoms)
      # index of pair features
      C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
      atom_to_pair.append(np.transpose(np.array([C1.flatten()+start, C0.flatten()+start])))
      start = start + n_atoms
      # number of pairs for each atom
      pair_split.extend([n_atoms]*n_atoms)
      # atom features
      atom_feat.append(mol.get_atom_features())
      # pair features
      pair_feat.append(np.reshape(mol.get_pair_features(), 
                                  (n_atoms*n_atoms, self.n_pair_feat)))
      
    atom_feat = np.concatenate(atom_feat, axis=0)
    pair_feat = np.concatenate(pair_feat, axis=0)
    atom_to_pair = np.concatenate(atom_to_pair, axis=0)
    atom_split = np.array(atom_split)
    n_pair = len(pair_split)
    pair_split = np.pad(pair_split, ((0, max_atoms-n_pair)), 'constant')
    pair_membership = np.array([True]*n_pair + [False]*(max_atoms-n_pair))
    
    # Generate dicts
    dict_DTNN = {
        self.atom_features_placeholder: atom_feat,
        self.pair_features_placeholder: pair_feat,
        self.atom_mask_placeholder: atom_mask,
        self.pair_mask_placeholder: pair_mask,
        self.membership_placeholder: membership
        self.pair_split_placeholder: pair_split,
        self.pair_membership_placeholder: pair_membership,
        self.atom_split_placeholder: atom_split,
        self.atom_to_pair_placeholder: atom_to_pair
    }
    return dict_DTNN
+1 −1
Original line number Diff line number Diff line
@@ -67,7 +67,7 @@ hps['dag'] = {
}
hps['weave'] = {
    'batch_size': 64,
    'nb_epoch': 50,
    'nb_epoch': 40,
    'learning_rate': 0.001,
    'n_graph_feat': 128,
    'n_pair_feat': 14,
+10 −16
Original line number Diff line number Diff line
@@ -266,15 +266,11 @@ def benchmark_classification(train_dataset,
    n_graph_feat = hyper_parameters['n_graph_feat']
    n_pair_feat = hyper_parameters['n_pair_feat']

    max_atoms_train = max([mol.get_num_atoms() for mol in train_dataset.X])
    max_atoms_valid = max([mol.get_num_atoms() for mol in valid_dataset.X])
    max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
    max_atoms = max([max_atoms_train, max_atoms_valid, max_atoms_test])

    graph_model = deepchem.nn.SequentialWeaveGraph(
        max_atoms=max_atoms, n_atom_feat=n_features, n_pair_feat=n_pair_feat)
    graph_model.add(deepchem.nn.WeaveLayer(max_atoms, 75, 14))
    graph_model.add(deepchem.nn.WeaveConcat(batch_size, n_output=n_graph_feat))
        batch_size, n_atom_feat=n_features, n_pair_feat=n_pair_feat, max_atoms=120)
    graph_model.add(deepchem.nn.WeaveLayer(75, 14))
    graph_model.add(deepchem.nn.WeaveLayer(50, 50))
    graph_model.add(deepchem.nn.Dense(n_graph_feat, 50, activation='tanh'))
    graph_model.add(deepchem.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(
        deepchem.nn.WeaveGather(
@@ -286,6 +282,7 @@ def benchmark_classification(train_dataset,
        n_features,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)
@@ -588,15 +585,11 @@ def benchmark_regression(train_dataset,
    n_graph_feat = hyper_parameters['n_graph_feat']
    n_pair_feat = hyper_parameters['n_pair_feat']

    max_atoms_train = max([mol.get_num_atoms() for mol in train_dataset.X])
    max_atoms_valid = max([mol.get_num_atoms() for mol in valid_dataset.X])
    max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
    max_atoms = max([max_atoms_train, max_atoms_valid, max_atoms_test])

    graph_model = deepchem.nn.SequentialWeaveGraph(
        max_atoms=max_atoms, n_atom_feat=n_features, n_pair_feat=n_pair_feat)
    graph_model.add(deepchem.nn.WeaveLayer(max_atoms, 75, 14))
    graph_model.add(deepchem.nn.WeaveConcat(batch_size, n_output=n_graph_feat))
        batch_size, n_atom_feat=n_features, n_pair_feat=n_pair_feat, max_atoms=80)
    graph_model.add(deepchem.nn.WeaveLayer(75, 14))
    graph_model.add(deepchem.nn.WeaveLayer(50, 50))
    graph_model.add(deepchem.nn.Dense(n_graph_feat, 50, activation='tanh'))
    graph_model.add(deepchem.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(
        deepchem.nn.WeaveGather(
@@ -608,6 +601,7 @@ def benchmark_regression(train_dataset,
        n_features,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)
Loading