Unverified Commit 8b630252 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1473 from VIGS25/textcnn-estimator

#1142: make_estimator support for TextCNN
parents 4036b8cd f1415632
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -335,13 +335,16 @@ class DTNNEmbedding(Layer):
    if in_layers is None:
      in_layers = self.in_layers
    in_layers = convert_to_layers(in_layers)

    self.build()

    atom_number = in_layers[0].out_tensor
    atom_number = tf.cast(atom_number, dtype=tf.int32)
    atom_features = tf.nn.embedding_lookup(self.embedding_list, atom_number)
    out_tensor = atom_features
    if set_tensors:
      self.variables = self.trainable_weights
      self.out_tensor = atom_features
    return out_tensor

  def none_tensors(self):
    embedding_list = self.embedding_list
@@ -417,9 +420,8 @@ class DTNNStep(Layer):
    distance_membership_j = in_layers[3].out_tensor
    distance_hidden = tf.matmul(distance, self.W_df) + self.b_df
    atom_features_hidden = tf.matmul(atom_features, self.W_cf) + self.b_cf
    outputs = tf.multiply(distance_hidden,
                          tf.gather(atom_features_hidden,
                                    distance_membership_j))
    outputs = tf.multiply(
        distance_hidden, tf.gather(atom_features_hidden, distance_membership_j))

    # for atom i in a molecule m, this step multiplies together distance info of atom pair(i,j)
    # and embeddings of atom j(both gone through a hidden layer)
+56 −51
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from deepchem.models.tensorgraph.layers import Dense, Concat, SoftMax, \
  Conv1D, ReduceMax, Squeeze, Stack, Highway
from deepchem.models.tensorgraph.graph_layers import DTNNEmbedding

from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature, Reshape, ReduceSum
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.trans import undo_transforms

@@ -118,7 +118,7 @@ class TextCNNModel(TensorGraph):
    self.dropout = dropout
    self.mode = mode
    super(TextCNNModel, self).__init__(**kwargs)
    self.build_graph()
    self._build_graph()

  @staticmethod
  def build_char_dict(dataset, default_dict=default_dict):
@@ -157,63 +157,53 @@ class TextCNNModel(TensorGraph):
      current_key_val += 1
    return out_dict, seq_length

  def build_graph(self):
  def _build_graph(self):
    self.smiles_seqs = Feature(shape=(None, self.seq_length), dtype=tf.int32)
    # Character embedding
    self.Embedding = DTNNEmbedding(
    Embedding = DTNNEmbedding(
        n_embedding=self.n_embedding,
        periodic_table_length=len(self.char_dict.keys()) + 1,
        in_layers=[self.smiles_seqs])
    self.pooled_outputs = []
    self.conv_layers = []
    pooled_outputs = []
    conv_layers = []
    for filter_size, num_filter in zip(self.kernel_sizes, self.num_filters):
      # Multiple convolutional layers with different filter widths
      self.conv_layers.append(
      conv_layers.append(
          Conv1D(
              kernel_size=filter_size,
              filters=num_filter,
              padding='valid',
              in_layers=[self.Embedding]))
              in_layers=[Embedding]))
      # Max-over-time pooling
      self.pooled_outputs.append(
          ReduceMax(axis=1, in_layers=[self.conv_layers[-1]]))
      pooled_outputs.append(ReduceMax(axis=1, in_layers=[conv_layers[-1]]))
    # Concat features from all filters(one feature per filter)
    concat_outputs = Concat(axis=1, in_layers=self.pooled_outputs)
    concat_outputs = Concat(axis=1, in_layers=pooled_outputs)
    dropout = Dropout(dropout_prob=self.dropout, in_layers=[concat_outputs])
    dense = Dense(
        out_channels=200, activation_fn=tf.nn.relu, in_layers=[dropout])
    # Highway layer from https://arxiv.org/pdf/1505.00387.pdf
    self.gather = Highway(in_layers=[dense])
    gather = Highway(in_layers=[dense])

    costs = []
    self.labels_fd = []
    for task in range(self.n_tasks):
    if self.mode == "classification":
        classification = Dense(
            out_channels=2, activation_fn=None, in_layers=[self.gather])
        softmax = SoftMax(in_layers=[classification])
        self.add_output(softmax)
      logits = Dense(
          out_channels=self.n_tasks * 2, activation_fn=None, in_layers=[gather])
      logits = Reshape(shape=(-1, self.n_tasks, 2), in_layers=[logits])
      output = SoftMax(in_layers=[logits])
      self.add_output(output)
      labels = Label(shape=(None, self.n_tasks, 2))
      loss = SoftMaxCrossEntropy(in_layers=[labels, logits])

        label = Label(shape=(None, 2))
        self.labels_fd.append(label)
        cost = SoftMaxCrossEntropy(in_layers=[label, classification])
        costs.append(cost)
      if self.mode == "regression":
        regression = Dense(
            out_channels=1, activation_fn=None, in_layers=[self.gather])
        self.add_output(regression)
    else:
      vals = Dense(
          out_channels=self.n_tasks * 1, activation_fn=None, in_layers=[gather])
      vals = Reshape(shape=(-1, self.n_tasks, 1), in_layers=[vals])
      self.add_output(vals)
      labels = Label(shape=(None, self.n_tasks, 1))
      loss = ReduceSum(L2Loss(in_layers=[labels, vals]))

        label = Label(shape=(None, 1))
        self.labels_fd.append(label)
        cost = L2Loss(in_layers=[label, regression])
        costs.append(cost)
    if self.mode == "classification":
      all_cost = Stack(in_layers=costs, axis=1)
    elif self.mode == "regression":
      all_cost = Stack(in_layers=costs, axis=1)
    self.weights = Weights(shape=(None, self.n_tasks))
    loss = WeightedError(in_layers=[all_cost, self.weights])
    self.set_loss(loss)
    weights = Weights(shape=(None, self.n_tasks))
    weighted_loss = WeightedError(in_layers=[loss, weights])
    self.set_loss(weighted_loss)

  def default_generator(self,
                        dataset,
@@ -224,8 +214,6 @@ class TextCNNModel(TensorGraph):
    """ Transfer smiles strings to fixed length integer vectors
    """
    for epoch in range(epochs):
      if not predict:
        print('Starting epoch %i' % epoch)
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
@@ -233,18 +221,35 @@ class TextCNNModel(TensorGraph):

        feed_dict = dict()
        if y_b is not None and not predict:
          for index, label in enumerate(self.labels_fd):
          if self.mode == "classification":
              feed_dict[label] = to_one_hot(y_b[:, index])
            if self.mode == "regression":
              feed_dict[label] = y_b[:, index:index + 1]
        if w_b is not None:
          feed_dict[self.weights] = w_b
        # Transform SMILES string to integer vectors
            feed_dict[self.labels[0]] = to_one_hot(y_b.flatten(), 2).reshape(
                -1, self.n_tasks, 2)
          else:
            feed_dict[self.labels[0]] = y_b
        if w_b is not None and not predict:
          feed_dict[self.task_weights[0]] = w_b

        # Transform SMILES sequence to integers
        smiles_seqs = [self.smiles_to_seq(smiles) for smiles in ids_b]
        feed_dict[self.smiles_seqs] = np.stack(smiles_seqs, axis=0)
        feed_dict[self.smiles_seqs] = np.vstack(smiles_seqs)
        yield feed_dict

  def create_estimator_inputs(self, feature_columns, weight_column, features,
                              labels, mode):
    """Map the model's input layers to tensors built from estimator inputs.

    Returns a dict keyed by the model's input layers: one entry per feature
    column, optionally one for the sample weights, and one for the labels
    (one-hot encoded with depth 2 when the model is in classification mode).
    """
    # One tensor per (feature layer, feature column) pair.
    tensors = {
        layer: tf.feature_column.input_layer(features, [column])
        for layer, column in zip(self.features, feature_columns)
    }
    if weight_column is not None:
      tensors[self.task_weights[0]] = tf.feature_column.input_layer(
          features, [weight_column])
    if labels is not None:
      # Classification labels arrive as integer class ids; regression labels
      # are passed through unchanged.
      tensors[self.labels[0]] = (
          tf.one_hot(tf.cast(labels, tf.int32), 2)
          if self.mode == "classification" else labels)
    return tensors

  def smiles_to_seq(self, smiles):
    """ Tokenize characters in smiles to integers
    """
@@ -268,7 +273,7 @@ class TextCNNModel(TensorGraph):
    for i in range(self.seq_length - len(seq)):
      # Padding with '_'
      seq.append(self.char_dict['_'])
    return np.array(seq)
    return np.array(seq, dtype=np.int32)

  def predict_on_generator(self, generator, transformers=[], outputs=None):
    out = super(TextCNNModel, self).predict_on_generator(
+86 −0
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ import tensorflow as tf
import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
from deepchem.data import NumpyDataset
from deepchem.models.tensorgraph.models.text_cnn import default_dict


class TestEstimators(unittest.TestCase):
@@ -279,6 +280,91 @@ class TestEstimators(unittest.TestCase):
    results = estimator.evaluate(input_fn=lambda: input_fn(1))
    assert results['accuracy'] > 0.9

  def test_textcnn_classification(self):
    """Test creating an Estimator from TextCNN for classification."""
    n_tasks = 2
    n_samples = 5

    # Build the underlying TensorGraph model.
    seq_length = 20
    textcnn = dc.models.TextCNNModel(
        n_tasks=n_tasks,
        char_dict=default_dict,
        seq_length=seq_length,
        kernel_sizes=[5, 5],
        num_filters=[20, 20])

    # Five toy SMILES strings, every task labeled zero.
    np.random.seed(123)
    smile_ids = ["CCCCC", "CCC(=O)O", "CCC", "CC(=O)O", "O=C=O"]
    features = [textcnn.smiles_to_seq(smile) for smile in smile_ids]
    labels = np.zeros((n_samples, n_tasks))
    sample_weights = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(features, labels, sample_weights, smile_ids)

    def accuracy(labels, predictions, weights):
      # Round softmax outputs to hard 0/1 predictions before comparing.
      return tf.metrics.accuracy(labels, tf.round(predictions), weights)

    def input_fn(epochs):
      iterator = dataset.make_iterator(batch_size=n_samples, epochs=epochs)
      x, y, weights = iterator.get_next()
      return {'x': x, 'weights': weights}, y

    # Wrap the model in a tf.estimator.Estimator.
    x_col = tf.feature_column.numeric_column(
        'x', shape=(seq_length,), dtype=tf.int32)
    weight_col = tf.feature_column.numeric_column('weights', shape=(n_tasks,))
    estimator = textcnn.make_estimator(
        feature_columns=[x_col],
        weight_column=weight_col,
        metrics={'accuracy': accuracy})

    # Train, then verify the model fit the trivial data.
    estimator.train(input_fn=lambda: input_fn(100))
    results = estimator.evaluate(input_fn=lambda: input_fn(1))
    assert results['loss'] < 1e-2
    assert results['accuracy'] > 0.9

  def test_textcnn_regression(self):
    """Test creating an Estimator from TextCNN for regression."""
    n_tasks = 2
    # Fix: n_samples must equal the number of SMILES strings below. The
    # previous value (10) disagreed with the 5 inputs, so X had 5 rows while
    # y/w had 10 — the extra label/weight rows were silently unused.
    n_samples = 5

    # Create a TensorGraph model.
    seq_length = 20
    model = dc.models.TextCNNModel(
        n_tasks=n_tasks,
        char_dict=default_dict,
        seq_length=seq_length,
        kernel_sizes=[5, 5],
        num_filters=[20, 20],
        mode="regression")

    np.random.seed(123)
    smile_ids = ["CCCCC", "CCC(=O)O", "CCC", "CC(=O)O", "O=C=O"]
    X = [model.smiles_to_seq(smile) for smile in smile_ids]
    # Regression labels carry an explicit trailing singleton dimension to
    # match the model's per-task scalar outputs.
    y = np.zeros((n_samples, n_tasks, 1), dtype=np.float32)
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, smile_ids)

    def input_fn(epochs):
      x, y, weights = dataset.make_iterator(
          batch_size=n_samples, epochs=epochs).get_next()
      return {'x': x, 'weights': weights}, y

    # Create an estimator from it.
    x_col = tf.feature_column.numeric_column('x', shape=(seq_length,))
    weight_col = tf.feature_column.numeric_column('weights', shape=(n_tasks,))
    metrics = {'error': tf.metrics.mean_absolute_error}
    estimator = model.make_estimator(
        feature_columns=[x_col], weight_column=weight_col, metrics=metrics)

    # Train the model and verify it fits the all-zero targets.
    estimator.train(input_fn=lambda: input_fn(100))
    results = estimator.evaluate(input_fn=lambda: input_fn(1))
    assert results['loss'] < 1e-1
    assert results['error'] < 0.1

  def test_scscore(self):
    """Test creating an Estimator from a ScScoreModel."""
    n_samples = 10