weave model first draft (d69e919c) · Commits · 钟慕尧 / deepchem

deepchem/feat/graph_features.py

+6 −1

Original line number	Diff line number	Diff line
		@@ -192,13 +192,15 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6):
		rings = mol.GetRingInfo().AtomRings()
		for a1 in range(num_atoms):
		for a2 in canon_adj_list[a1]:
		# first `bt_len` features are bond features(if applicable)
		features[a1, a2, :bt_len] = np.asarray(
		edge_list[tuple(sorted((a1, a2)))], dtype=float)
		for ring in rings:
		if a1 in ring:
		# `bt_len`-th feature is if the pair of atoms are in the same ring
		features[a1, ring, bt_len] = 1
		features[a1, a1, bt_len] = 0.
		# find graph distance between two atoms
		# graph distance between two atoms
		distance = find_distance(
		a1, num_atoms, canon_adj_list, max_distance=max_distance)
		features[a1, :, bt_len + 1:] = distance
		@@ -209,11 +211,14 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6):
		def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
		distance = np.zeros((num_atoms, max_distance))
		radial = 0
		# atoms `radial` bonds away from `a1`
		adj_list = set(canon_adj_list[a1])
		# atoms less than `radial` bonds away
		all_list = set([a1])
		while radial < max_distance:
		distance[list(adj_list), radial] = 1
		all_list.update(adj_list)
		# find atoms `radial`+1 bonds away
		next_adj = set()
		for adj in adj_list:
		next_adj.update(canon_adj_list[adj])

deepchem/models/tests/test_overfit.py

+99 −0

Original line number	Diff line number	Diff line
		@@ -752,6 +752,105 @@ class TestOverfit(test_util.TensorFlowTestCase):

		assert scores[regression_metric.name] > .8

		def test_weave_singletask_classification_overfit(self):
		    """Check that a small Weave classifier can overfit a tiny dataset.

		    Featurizes ``example_classification.csv`` with ``WeaveFeaturizer``,
		    trains a single-``WeaveLayer`` graph classifier for 20 epochs and
		    asserts that accuracy, evaluated on the same data it was trained on,
		    exceeds 0.65.
		    """
		    # Seed both NumPy and TensorFlow before anything else runs.
		    np.random.seed(123)
		    tf.set_random_seed(123)

		    # Hyper-parameters shared by the graph and the model.
		    n_tasks = 1
		    n_atom_feat = 75
		    n_pair_feat = 14
		    n_feat = 128
		    batch_size = 10
		    max_atoms = 50

		    # Load the mini classification dataset.
		    weave_featurizer = dc.feat.WeaveFeaturizer()
		    csv_path = os.path.join(self.current_dir, "example_classification.csv")
		    csv_loader = dc.data.CSVLoader(
		        tasks=["outcome"],
		        smiles_field="smiles",
		        featurizer=weave_featurizer)
		    train_set = csv_loader.featurize(csv_path)

		    accuracy = dc.metrics.Metric(dc.metrics.accuracy_score)

		    # Weave layer -> concat atoms -> batch norm -> gather per molecule.
		    weave_graph = dc.nn.SequentialWeaveGraph(
		        max_atoms=max_atoms,
		        n_atom_feat=n_atom_feat,
		        n_pair_feat=n_pair_feat)
		    weave_graph.add(dc.nn.WeaveLayer(max_atoms, n_atom_feat, n_pair_feat))
		    weave_graph.add(dc.nn.WeaveConcat(batch_size, n_output=n_feat))
		    weave_graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
		    weave_graph.add(dc.nn.WeaveGather(batch_size, n_input=n_feat))

		    classifier = dc.models.MultitaskGraphClassifier(
		        weave_graph,
		        n_tasks,
		        n_feat,
		        batch_size=batch_size,
		        learning_rate=1e-3,
		        learning_rate_decay_time=1000,
		        optimizer_type="adam",
		        beta1=0.9,
		        beta2=0.999)

		    # Train, then persist the fitted model.
		    classifier.fit(train_set, nb_epoch=20)
		    classifier.save()

		    # Score on the training data: the point is to confirm overfitting.
		    train_scores = classifier.evaluate(train_set, [accuracy])

		    assert train_scores[accuracy.name] > 0.65

		def test_weave_singletask_regression_overfit(self):
		    """Check that a small Weave regressor can overfit a tiny dataset.

		    Featurizes ``example_regression.csv`` with ``WeaveFeaturizer``,
		    trains a single-``WeaveLayer`` graph regressor for 40 epochs and
		    asserts that the Pearson R^2 score (averaged over tasks with
		    ``np.mean``), evaluated on the same data it was trained on,
		    exceeds 0.9.
		    """
		    # Seed both NumPy and TensorFlow before anything else runs.
		    np.random.seed(123)
		    tf.set_random_seed(123)

		    # Hyper-parameters shared by the graph and the model.
		    n_tasks = 1
		    n_atom_feat = 75
		    n_pair_feat = 14
		    n_feat = 128
		    batch_size = 10
		    max_atoms = 50

		    # Load the mini regression dataset.
		    weave_featurizer = dc.feat.WeaveFeaturizer()
		    csv_path = os.path.join(self.current_dir, "example_regression.csv")
		    csv_loader = dc.data.CSVLoader(
		        tasks=["outcome"],
		        smiles_field="smiles",
		        featurizer=weave_featurizer)
		    train_set = csv_loader.featurize(csv_path)

		    pearson_r2 = dc.metrics.Metric(
		        dc.metrics.pearson_r2_score, task_averager=np.mean)

		    # Weave layer -> concat atoms -> batch norm -> gather per molecule.
		    weave_graph = dc.nn.SequentialWeaveGraph(
		        max_atoms=max_atoms,
		        n_atom_feat=n_atom_feat,
		        n_pair_feat=n_pair_feat)
		    weave_graph.add(dc.nn.WeaveLayer(max_atoms, n_atom_feat, n_pair_feat))
		    weave_graph.add(dc.nn.WeaveConcat(batch_size, n_output=n_feat))
		    weave_graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
		    weave_graph.add(dc.nn.WeaveGather(batch_size, n_input=n_feat))

		    regressor = dc.models.MultitaskGraphRegressor(
		        weave_graph,
		        n_tasks,
		        n_feat,
		        batch_size=batch_size,
		        learning_rate=1e-3,
		        learning_rate_decay_time=1000,
		        optimizer_type="adam",
		        beta1=0.9,
		        beta2=0.999)

		    # Train, then persist the fitted model.
		    regressor.fit(train_set, nb_epoch=40)
		    regressor.save()

		    # Score on the training data: the point is to confirm overfitting.
		    train_scores = regressor.evaluate(train_set, [pearson_r2])

		    assert train_scores[pearson_r2.name] > 0.9

		def test_siamese_singletask_classification_overfit(self):
		"""Test siamese singletask model overfits tiny data."""
		np.random.seed(123)

deepchem/molnet/load_function/tox21_datasets.py

+2 −0

Original line number	Diff line number	Diff line
		@@ -32,6 +32,8 @@ def load_tox21(featurizer='ECFP', split='index', K=4):
		featurizer = deepchem.feat.CircularFingerprint(size=1024)
		elif featurizer == 'GraphConv':
		featurizer = deepchem.feat.ConvMolFeaturizer()
		elif featurizer == 'Weave':
		featurizer = deepchem.feat.WeaveFeaturizer()
		elif featurizer == 'Raw':
		featurizer = deepchem.feat.RawFeaturizer()

deepchem/nn/weave_layers.py

+40 −20

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ from deepchem.nn import initializations
		from deepchem.nn import model_ops
		from deepchem.nn.copy import Layer


		class WeaveLayer(Layer):
		"""Main layer of Weave model
		For each molecule, atom features and pair features are recombined to
		@@ -24,6 +25,7 @@ class WeaveLayer(Layer):
		"""

		def __init__(self,
		max_atoms,
		n_atom_input_feat=75,
		n_pair_input_feat=14,
		n_atom_output_feat=50,
		@@ -58,7 +60,7 @@ class WeaveLayer(Layer):

		"""
		super(WeaveLayer, self).__init__(**kwargs)

		self.max_atoms = max_atoms
		self.init = initializations.get(init) # Set weight initialization
		self.activation = activations.get(activation) # Get activations
		self.n_hidden_AA = n_hidden_AA
		@@ -107,9 +109,10 @@ class WeaveLayer(Layer):
		self.n_pair_output_feat,
		])

		self.trainable_weights = [self.W_AA, self.b_AA, self.W_PA, self.b_PA,
		self.W_A, self.b_A, self.W_AP, self.b_AP, self.W_PP, self.b_PP,
		self.W_P, self.b_P]
		self.trainable_weights = [
		self.W_AA, self.b_AA, self.W_PA, self.b_PA, self.W_A, self.b_A,
		self.W_AP, self.b_AP, self.W_PP, self.b_PP, self.W_P, self.b_P
		]

		def call(self, x, mask=None):
		"""Execute this layer on input tensors.
		@@ -138,7 +141,7 @@ class WeaveLayer(Layer):

		atom_mask = x[2]
		pair_mask = x[3]
		max_atoms = atom_features.get_shape().as_list()[1]
		max_atoms = self.max_atoms

		AA = tf.tensordot(atom_features, self.W_AA, [[2], [0]]) + self.b_AA
		AA = self.activation(AA)
		@@ -164,6 +167,7 @@ class WeaveLayer(Layer):
		P = tf.multiply(P, tf.expand_dims(pair_mask, axis=3))
		return A, P


		class WeaveConcat(Layer):
		"""Concat a batch of molecules into a batch of atoms
		"""
		@@ -202,7 +206,9 @@ class WeaveConcat(Layer):
		"""

		self.W = self.init([self.n_atom_input_feat, self.n_output])
		self.b = model_ops.zeros(shape=[self.n_output,])
		self.b = model_ops.zeros(shape=[
		self.n_output,
		])

		self.trainable_weights = self.W + self.b

		@@ -227,12 +233,15 @@ class WeaveConcat(Layer):
		atom_features = x[0]
		atom_masks = x[1]
		A = tf.split(atom_features, self.batch_size, axis=0)
		A_mask = tf.split(tf.cast(atom_masks, dtype=tf.bool), self.batch_size, axis=0)
		outputs = tf.concat([tf.boolean_mask(A[i], A_mask[i]) for i in range(len(A))], axis=0)
		A_mask = tf.split(
		tf.cast(atom_masks, dtype=tf.bool), self.batch_size, axis=0)
		outputs = tf.concat(
		[tf.boolean_mask(A[i], A_mask[i]) for i in range(len(A))], axis=0)
		outputs = tf.matmul(outputs, self.W) + self.b
		outputs = self.activation(outputs)
		return outputs


		class WeaveGather(Layer):
		"""Gather layer of Weave model
		a batch of normalized atom features go through a hidden layer,
		@@ -241,7 +250,8 @@ class WeaveGather(Layer):

		def __init__(self,
		batch_size,
		gaussian_expand=True,
		n_input=128,
		gaussian_expand=False,
		epsilon=1e-3,
		momentum=0.99,
		**kwargs):
		@@ -254,7 +264,7 @@ class WeaveGather(Layer):
		Whether to expand each dimension of atomic features by gaussian histogram

		"""

		self.n_input = n_input
		self.batch_size = batch_size
		self.gaussian_expand = gaussian_expand
		self.epsilon = epsilon
		@@ -296,5 +306,15 @@ class WeaveGather(Layer):
		output_molecules = tf.stack(output_molecules)
		return output_molecules

		def gaussian_histogram(x):
		return x
		def gaussian_histogram(self, x):
		    """Expand ``x`` by evaluating it under a fixed bank of Gaussians.

		    One ``Normal`` distribution is built per ``(mean, std)`` pair in
		    ``gaussian_memberships`` (11 pairs); the pdf of each is evaluated
		    at ``x`` and the results are concatenated along axis 1.

		    Parameters
		    ----------
		    x: tensor
		      Values to expand. Must have at least two dimensions because the
		      outputs are concatenated along axis 1.
		      NOTE(review): presumably (n_atoms, n_feat) -- confirm against caller.

		    Returns
		    -------
		    Tensor holding the per-Gaussian pdf values of ``x``, joined along
		    axis 1 in the order the Gaussians are listed.
		    """
		    # (mean, std) of each Gaussian membership function.
		    gaussian_memberships = [(-1.645, 0.080), (-1.080, 0.029),
		                            (-0.739, 0.018), (-0.468, 0.014),
		                            (-0.228, 0.013), (0., 0.013),
		                            (0.228, 0.013), (0.468, 0.014),
		                            (0.739, 0.018), (1.080, 0.029),
		                            (1.645, 0.080)]
		    dist = [
		        tf.contrib.distributions.Normal(mu=mean, sigma=std)
		        for mean, std in gaussian_memberships
		    ]
		    # Iterate over the distributions themselves rather than a hard-coded
		    # range(11), so the table above can change length without leaving a
		    # stale loop bound behind.
		    outputs = [d.pdf(x) for d in dist]
		    return tf.concat(outputs, axis=1)

examples/delaney/delaney_weave.py

+10 −12

Original line number	Diff line number	Diff line
		@@ -24,35 +24,33 @@ max_atoms_valid = max([mol.get_num_atoms() for mol in valid_dataset.X])
		max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
		max_atoms = max([max_atoms_train, max_atoms_valid, max_atoms_test])


		n_atom_feat = 75
		n_pair_feat = 14
		max_atoms = 55
		# Batch size of models
		batch_size = 64
		n_output = 128
		graph = dc.nn.SequentialWeaveGraph(max_atoms=max_atoms,
		n_atom_feat=n_atom_feat,
		n_pair_feat=n_pair_feat)
		n_feat = 128
		graph = dc.nn.SequentialWeaveGraph(
		max_atoms=max_atoms, n_atom_feat=n_atom_feat, n_pair_feat=n_pair_feat)

		graph.add(dc.nn.WeaveLayer())
		graph.add(dc.nn.WeaveConcat(batch_size))
		graph.add(dc.nn.WeaveLayer(max_atoms, 75, 14))
		#graph.add(dc.nn.WeaveLayer(max_atoms, 50, 50))
		graph.add(dc.nn.WeaveConcat(batch_size, n_output=n_feat))
		graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
		graph.add(dc.nn.WeaveGather(batch_size, gaussian_expand=False))
		graph.add(dc.nn.WeaveGather(batch_size, n_input=n_feat, gaussian_expand=False))

		model = dc.models.MultitaskGraphRegressor(
		graph,
		len(delaney_tasks),
		128,
		n_feat,
		batch_size=batch_size,
		learning_rate=1e-4,
		learning_rate=1e-3,
		learning_rate_decay_time=1000,
		optimizer_type="adam",
		beta1=.9,
		beta2=.999)

		# Fit trained model
		model.fit(train_dataset, nb_epoch=40, log_every_N_batches=50)
		model.fit(train_dataset, nb_epoch=50, log_every_N_batches=50)
		print("Evaluating model")
		train_scores = model.evaluate(train_dataset, [metric], transformers)
		valid_scores = model.evaluate(valid_dataset, [metric], transformers)

Admin message