Merge remote-tracking branch 'remotes/origin/master' (de39f971) · Commits · 钟慕尧 / deepchem

deepchem/models/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -32,3 +32,4 @@ from deepchem.models.tensorgraph.models.graph_models import WeaveTensorGraph, DT
		from deepchem.models.tensorgraph.models.symmetry_function_regression import BPSymmetryFunctionRegression, ANIRegression

		from deepchem.models.tensorgraph.models.seqtoseq import SeqToSeq
		from deepchem.models.tensorgraph.models.text_cnn import TextCNNTensorGraph

deepchem/models/tensorgraph/layers.py

+106 −0

Original line number	Diff line number	Diff line
		@@ -480,6 +480,67 @@ class Dense(Layer):
		return self._shared_with._get_scope_name()


		class Highway(Layer):
		  """Create a highway layer. y = H(x) * T(x) + x * (1 - T(x))

		  H(x) = activation_fn(matmul(W_H, x) + b_H) is the non-linear transformed output
		  T(x) = sigmoid(matmul(W_T, x) + b_T) is the transform gate

		  reference: https://arxiv.org/pdf/1505.00387.pdf

		  This layer expects its input to be a two dimensional tensor of shape
		  (batch size, # input features). Outputs will be in the same shape.
		  """

		  def __init__(
		      self,
		      activation_fn=tf.nn.relu,
		      biases_initializer=tf.zeros_initializer,
		      weights_initializer=tf.contrib.layers.variance_scaling_initializer,
		      **kwargs):
		    """Create a Highway layer.

		    Parameters
		    ----------
		    activation_fn: object
		      the Tensorflow activation function applied to H(x)
		    biases_initializer: callable object
		      zero-argument factory returning the initializer for the bias of H(x).
		      This may be None, in which case H(x) is built without a bias. The
		      transform gate T(x) always keeps its own bias, initialized to -1.
		    weights_initializer: callable object
		      zero-argument factory returning the initializer for the weights of
		      both H(x) and T(x)
		    """
		    super(Highway, self).__init__(**kwargs)
		    self.activation_fn = activation_fn
		    self.biases_initializer = biases_initializer
		    self.weights_initializer = weights_initializer

		  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
		    """Build H(x) * T(x) + x * (1 - T(x)) on the first input tensor.

		    Parameters
		    ----------
		    in_layers: optional
		      forwarded to `_get_input_tensors`; only the first resolved tensor is used
		    set_tensors: bool
		      if True, the result is also stored in `self.out_tensor`

		    Returns
		    -------
		    The output tensor of the highway layer.
		    """
		    inputs = self._get_input_tensors(in_layers)
		    parent = inputs[0]
		    # Both H and T map the input back onto the same number of features so
		    # that they can be mixed element-wise with the untouched input.
		    n_features = parent.get_shape().as_list()[1]
		    # biases_initializer is documented as optional: only instantiate it when
		    # it was provided, otherwise calling None would raise a TypeError.
		    # fully_connected skips the bias when it receives None.
		    if self.biases_initializer is None:
		      biases_initializer = None
		    else:
		      biases_initializer = self.biases_initializer()
		    # H(x), with same number of input and output channels
		    dense_H = tf.contrib.layers.fully_connected(
		        parent,
		        num_outputs=n_features,
		        activation_fn=self.activation_fn,
		        biases_initializer=biases_initializer,
		        weights_initializer=self.weights_initializer(),
		        trainable=True)
		    # T(x), with same number of input and output channels. The gate bias
		    # starts at -1, i.e. sigmoid(-1) < 0.5 before the weights contribute.
		    dense_T = tf.contrib.layers.fully_connected(
		        parent,
		        num_outputs=n_features,
		        activation_fn=tf.nn.sigmoid,
		        biases_initializer=tf.constant_initializer(-1),
		        weights_initializer=self.weights_initializer(),
		        trainable=True)
		    out_tensor = tf.multiply(dense_H, dense_T) + tf.multiply(
		        parent, 1 - dense_T)
		    if set_tensors:
		      self.out_tensor = out_tensor
		    return out_tensor


		class Flatten(Layer):
		"""Flatten every dimension except the first"""

		@@ -1516,6 +1577,51 @@ class Conv3D(Layer):
		return out_tensor


		class MaxPool1D(Layer):
		  """A 1D max pooling on the input.

		  This layer expects its input to be a three dimensional tensor of shape
		  (batch size, width, # channels).
		  """

		  def __init__(self, window_shape=2, strides=1, padding="SAME", **kwargs):
		    """Create a MaxPool1D layer.

		    Parameters
		    ----------
		    window_shape: int, optional
		      size of the window(assuming input with only one dimension)
		    strides: int, optional
		      stride of the sliding window
		    padding: str
		      the padding method to use, either 'SAME' or 'VALID'
		    """
		    self.window_shape = window_shape
		    self.strides = strides
		    self.padding = padding
		    self.pooling_type = "MAX"
		    super(MaxPool1D, self).__init__(**kwargs)
		    try:
		      # Only the width axis is pooled; batch size and channels pass through.
		      batch, width, channels = self.in_layers[0].shape
		      self._shape = (batch,
		                     self._pooled_width(width, window_shape, strides,
		                                        padding), channels)
		    except Exception:
		      # Static shape inference is best effort: the parent may be missing or
		      # may not know its own shape. The output shape is then left unset.
		      pass

		  @staticmethod
		  def _pooled_width(width, window_shape, strides, padding):
		    """Return the pooled length of an axis of size `width` (None if unknown).

		    Follows the output-size rule of `tf.nn.pool`: ceil(width / strides) for
		    'SAME' padding and ceil((width - window_shape + 1) / strides) for 'VALID'.
		    """
		    if width is None:
		      return None
		    if str(padding).upper() == "VALID":
		      width = width - window_shape + 1
		    # Integer ceiling division.
		    return -(-width // strides)

		  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
		    """Apply max pooling along the width axis of the first input tensor.

		    Parameters
		    ----------
		    in_layers: optional
		      forwarded to `_get_input_tensors`; only the first resolved tensor is used
		    set_tensors: bool
		      if True, the result is also stored in `self.out_tensor`

		    Returns
		    -------
		    The pooled tensor.
		    """
		    inputs = self._get_input_tensors(in_layers)
		    in_tensor = inputs[0]
		    # tf.nn.pool expects one entry per spatial dimension, hence the
		    # single-element lists for the window and the stride.
		    out_tensor = tf.nn.pool(
		        in_tensor,
		        window_shape=[self.window_shape],
		        pooling_type=self.pooling_type,
		        padding=self.padding,
		        strides=[self.strides])
		    if set_tensors:
		      self.out_tensor = out_tensor
		    return out_tensor


		class MaxPool2D(Layer):

		def __init__(self,

deepchem/models/tensorgraph/models/text_cnn.py

0 → 100644

+274 −0

Original line number	Diff line number	Diff line
		"""
		Created on Thu Sep 28 15:17:50 2017

		@author: zqwu
		"""
		import numpy as np
		import tensorflow as tf
		import copy

		from deepchem.metrics import to_one_hot, from_one_hot
		from deepchem.models.tensorgraph.layers import Dense, Concat, SoftMax, \
		SoftMaxCrossEntropy, BatchNorm, WeightedError, Dropout, BatchNormalization, \
		Conv1D, MaxPool1D, Squeeze, Stack, Highway
		from deepchem.models.tensorgraph.graph_layers import DTNNEmbedding

		from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
		from deepchem.models.tensorgraph.tensor_graph import TensorGraph
		from deepchem.trans import undo_transforms

		# Common symbols in SMILES, numbered from 1 in the order listed below.
		# Note that 'Cl' and 'Br' are each regarded as a single symbol.
		default_dict = {
		    symbol: index
		    for index, symbol in enumerate(
		        ('#', '(', ')', '+', '-', '/', '1', '2', '3', '4', '5', '6', '7', '8',
		         '=', 'C', 'F', 'H', 'I', 'N', 'O', 'P', 'S', '[', '\\', ']', '_',
		         'c', 'Cl', 'Br', 'n', 'o', 's'),
		        start=1)
		}


		class TextCNNTensorGraph(TensorGraph):
		  """ A Convolutional neural network on smiles strings
		  Reimplementation of the discriminator module in ORGAN: https://arxiv.org/abs/1705.10843
		  Originated from: http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf

		  This model applies multiple 1D convolutional filters to the padded strings,
		  then max-over-time pooling is applied on all filters, extracting one feature per filter.
		  All features are concatenated and transformed through several hidden layers to form predictions.

		  This model was initially developed for sentence-level classification tasks, with
		  words represented as vectors. In this implementation, SMILES strings are dissected
		  into characters which are mapped to integers and embedded in a similar way. The
		  model can be used for general molecular-level classification or regression tasks.
		  It is also used in the ORGAN model as discriminator.

		  Training of the model only requires SMILES strings input, all featurized datasets
		  that include SMILES in the `ids` attribute are accepted. PDBbind, QM7 and QM7b
		  are not supported. To use the model, `build_char_dict` should be called first
		  before defining the model to build character dict of input dataset, example can
		  be found in examples/delaney/delaney_textcnn.py

		  """

		  def __init__(
		      self,
		      n_tasks,
		      char_dict,
		      seq_length,
		      n_embedding=75,
		      filter_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
		      num_filters=[100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160],
		      dropout=0.25,
		      mode="classification",
		      **kwargs):
		    """
		    Parameters
		    ----------
		    n_tasks: int
		      Number of tasks
		    char_dict: dict
		      Mapping from characters in smiles to integers
		    seq_length: int
		      Length of sequences(after padding)
		    n_embedding: int, optional
		      Length of embedding vector
		    filter_sizes: list of int, optional
		      Width of each convolutional filter bank
		    num_filters: list of int, optional
		      Number of filters in each bank, must have one entry per filter size
		    dropout: float, optional
		      Dropout probability applied to the concatenated pooled features
		    mode: str
		      Either "classification" or "regression" for type of model.

		    Raises
		    ------
		    ValueError
		      If `mode` is not supported or if `filter_sizes` and `num_filters`
		      do not have the same length.
		    """
		    if mode not in ("classification", "regression"):
		      # Fail here instead of with a NameError halfway through build_graph.
		      raise ValueError(
		          'mode must be "classification" or "regression", got %r' % (mode,))
		    # Store copies so the shared default lists can never be mutated through
		    # an instance.
		    filter_sizes = list(filter_sizes)
		    num_filters = list(num_filters)
		    if len(filter_sizes) != len(num_filters):
		      # zip() in build_graph would otherwise silently drop the extra entries.
		      raise ValueError(
		          'filter_sizes and num_filters must have the same length, got %d and %d'
		          % (len(filter_sizes), len(num_filters)))
		    self.n_tasks = n_tasks
		    self.char_dict = char_dict
		    self.seq_length = seq_length
		    self.n_embedding = n_embedding
		    self.filter_sizes = filter_sizes
		    self.num_filters = num_filters
		    self.dropout = dropout
		    self.mode = mode
		    super(TextCNNTensorGraph, self).__init__(**kwargs)
		    self.build_graph()

		  @staticmethod
		  def build_char_dict(dataset, default_dict=default_dict):
		    """ Collect all unique characters(in smiles) from the dataset.
		    This method should be called before defining the model to build appropriate char_dict

		    Parameters
		    ----------
		    dataset: object
		      Dataset whose `ids` attribute holds the SMILES strings
		    default_dict: dict, optional
		      Initial mapping from symbols to integers; it is copied, not modified

		    Returns
		    -------
		    out_dict: dict
		      `default_dict` extended with every unrecognized character, numbered
		      consecutively from `len(default_dict) + 1` in order of first appearance
		    seq_length: int
		      1.2 times the length of the longest SMILES string, rounded down
		    """
		    # SMILES strings
		    X = dataset.ids
		    # Maximum length is expanded to allow length variation during train and inference
		    seq_length = int(max([len(smile) for smile in X]) * 1.2)
		    # '_' served as delimiter and padding
		    all_smiles = '_'.join(X)
		    tot_len = len(all_smiles)
		    # Values are plain integers, so a shallow copy is sufficient
		    out_dict = dict(default_dict)
		    current_key_val = len(out_dict) + 1
		    # Set of recognized symbols for constant-time membership tests; space is
		    # included so that it never becomes an extra key
		    known = set(out_dict)
		    known.add(' ')
		    i = 0
		    while i < tot_len:
		      # For 'Cl', 'Br', etc.
		      if all_smiles[i:i + 2] in known:
		        i += 2
		      elif all_smiles[i] in known:
		        i += 1
		      else:
		        # Character not recognized, register it under the next free index
		        known.add(all_smiles[i])
		        out_dict[all_smiles[i]] = current_key_val
		        current_key_val += 1
		        i += 1
		    return out_dict, seq_length

		  def build_graph(self):
		    """ Assemble embedding, convolution, pooling, highway, output and loss layers
		    """
		    self.smiles_seqs = Feature(shape=(None, self.seq_length), dtype=tf.int32)
		    # Character embedding; one extra row because index 0 is not used by char_dict
		    # built from `build_char_dict` but is emitted by `smiles_to_seq`
		    self.Embedding = DTNNEmbedding(
		        n_embedding=self.n_embedding,
		        periodic_table_length=len(self.char_dict) + 1,
		        in_layers=[self.smiles_seqs])
		    self.pooled_outputs = []
		    self.conv_layers = []
		    for filter_size, num_filter in zip(self.filter_sizes, self.num_filters):
		      # Multiple convolutional layers with different filter widths
		      self.conv_layers.append(
		          Conv1D(
		              filter_size,
		              num_filter,
		              padding='VALID',
		              in_layers=[self.Embedding]))
		      # Max-over-time pooling: the window spans the whole convolution output
		      self.pooled_outputs.append(
		          MaxPool1D(
		              window_shape=self.seq_length - filter_size + 1,
		              strides=1,
		              padding='VALID',
		              in_layers=[self.conv_layers[-1]]))
		    # Concat features from all filters(one feature per filter)
		    concat_outputs = Concat(axis=2, in_layers=self.pooled_outputs)
		    outputs = Squeeze(squeeze_dims=1, in_layers=[concat_outputs])
		    dropout = Dropout(dropout_prob=self.dropout, in_layers=[outputs])
		    dense = Dense(
		        out_channels=200, activation_fn=tf.nn.relu, in_layers=[dropout])
		    # Highway layer from https://arxiv.org/pdf/1505.00387.pdf
		    self.gather = Highway(in_layers=[dense])

		    costs = []
		    self.labels_fd = []
		    # One output head, label and cost per task
		    for _ in range(self.n_tasks):
		      if self.mode == "classification":
		        classification = Dense(
		            out_channels=2, activation_fn=None, in_layers=[self.gather])
		        softmax = SoftMax(in_layers=[classification])
		        self.add_output(softmax)

		        label = Label(shape=(None, 2))
		        self.labels_fd.append(label)
		        cost = SoftMaxCrossEntropy(in_layers=[label, classification])
		        costs.append(cost)
		      elif self.mode == "regression":
		        regression = Dense(
		            out_channels=1, activation_fn=None, in_layers=[self.gather])
		        self.add_output(regression)

		        label = Label(shape=(None, 1))
		        self.labels_fd.append(label)
		        cost = L2Loss(in_layers=[label, regression])
		        costs.append(cost)
		    if self.mode == "classification":
		      all_cost = Concat(in_layers=costs, axis=1)
		    elif self.mode == "regression":
		      all_cost = Stack(in_layers=costs, axis=1)
		    self.weights = Weights(shape=(None, self.n_tasks))
		    loss = WeightedError(in_layers=[all_cost, self.weights])
		    self.set_loss(loss)

		  def default_generator(self,
		                        dataset,
		                        epochs=1,
		                        predict=False,
		                        deterministic=True,
		                        pad_batches=True):
		    """ Transfer smiles strings to fixed length integer vectors

		    Yields one feed dict per batch. Labels are only fed when `predict` is
		    False and the batch has labels; weights are fed whenever the batch has
		    them; the tokenized SMILES (taken from the batch ids) are always fed.
		    """
		    for epoch in range(epochs):
		      if not predict:
		        print('Starting epoch %i' % epoch)
		      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
		          batch_size=self.batch_size,
		          deterministic=deterministic,
		          pad_batches=pad_batches):

		        feed_dict = dict()
		        if y_b is not None and not predict:
		          for index, label in enumerate(self.labels_fd):
		            if self.mode == "classification":
		              feed_dict[label] = to_one_hot(y_b[:, index])
		            elif self.mode == "regression":
		              # Slice instead of index to keep the (batch, 1) label shape
		              feed_dict[label] = y_b[:, index:index + 1]
		        if w_b is not None:
		          feed_dict[self.weights] = w_b
		        # Transform SMILES string to integer vectors
		        smiles_seqs = [self.smiles_to_seq(smiles) for smiles in ids_b]
		        feed_dict[self.smiles_seqs] = np.stack(smiles_seqs, axis=0)
		        yield feed_dict

		  def smiles_to_seq(self, smiles):
		    """ Tokenize characters in smiles to integers

		    The sequence starts with index 0, spaces are skipped, two-character
		    symbols are matched before single characters, and the result is padded
		    with the index of '_' up to `seq_length`.

		    Raises
		    ------
		    ValueError
		      If a character is missing from `char_dict`, or if the tokenized
		      sequence is longer than `seq_length`.
		    """
		    char_dict = self.char_dict
		    smiles_len = len(smiles)
		    seq = [0]
		    i = 0
		    while i < smiles_len:
		      # Skip all spaces
		      if smiles[i:i + 1] == ' ':
		        i += 1
		      # For 'Cl', 'Br', etc.
		      elif smiles[i:i + 2] in char_dict:
		        seq.append(char_dict[smiles[i:i + 2]])
		        i += 2
		      elif smiles[i:i + 1] in char_dict:
		        seq.append(char_dict[smiles[i:i + 1]])
		        i += 1
		      else:
		        raise ValueError('character not found in dict')
		    n_padding = self.seq_length - len(seq)
		    if n_padding < 0:
		      # An over-long sequence cannot be stacked into a batch or fed into the
		      # fixed-width input, so report it here with a clear message
		      raise ValueError(
		          'SMILES string has %d tokens, which exceeds seq_length %d' %
		          (len(seq), self.seq_length))
		    if n_padding > 0:
		      # Padding with '_'
		      seq.extend([char_dict['_']] * n_padding)
		    return np.array(seq)

deepchem/models/tests/test_overfit.py

+72 −0

Original line number	Diff line number	Diff line
		@@ -1380,6 +1380,78 @@ class TestOverfit(test_util.TensorFlowTestCase):

		assert scores[regression_metric.name] > .8

		def test_textCNN_singletask_classification_overfit(self):
		  """Check that the textCNN classifier can overfit a tiny dataset."""
		  np.random.seed(123)
		  tf.set_random_seed(123)

		  # Raw featurization of the example CSV with a single "outcome" task
		  raw_featurizer = dc.feat.RawFeaturizer()
		  csv_path = os.path.join(self.current_dir, "example_classification.csv")
		  csv_loader = dc.data.CSVLoader(
		      tasks=["outcome"], smiles_field="smiles", featurizer=raw_featurizer)
		  dataset = csv_loader.featurize(csv_path)

		  accuracy = dc.metrics.Metric(dc.metrics.accuracy_score)

		  # The character dictionary and padded length are derived from the data
		  char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(dataset)

		  model = dc.models.TextCNNTensorGraph(
		      1,
		      char_dict,
		      seq_length=length,
		      batch_size=10,
		      learning_rate=0.001,
		      use_queue=False,
		      mode="classification")

		  # Train, then score on the very same data
		  model.fit(dataset, nb_epoch=200)
		  train_scores = model.evaluate(dataset, [accuracy])

		  assert train_scores[accuracy.name] > .8

		def test_textCNN_singletask_regression_overfit(self):
		  """Check that the textCNN regressor can overfit a tiny dataset."""
		  np.random.seed(123)
		  tf.set_random_seed(123)

		  # Raw featurization of the example CSV with a single "outcome" task
		  raw_featurizer = dc.feat.RawFeaturizer()
		  csv_path = os.path.join(self.current_dir, "example_regression.csv")
		  csv_loader = dc.data.CSVLoader(
		      tasks=["outcome"], smiles_field="smiles", featurizer=raw_featurizer)
		  dataset = csv_loader.featurize(csv_path)

		  pearson_r2 = dc.metrics.Metric(
		      dc.metrics.pearson_r2_score, task_averager=np.mean)

		  # The character dictionary and padded length are derived from the data
		  char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(dataset)

		  model = dc.models.TextCNNTensorGraph(
		      1,
		      char_dict,
		      seq_length=length,
		      batch_size=10,
		      learning_rate=0.001,
		      use_queue=False,
		      mode="regression")

		  # Train, then score on the very same data
		  model.fit(dataset, nb_epoch=200)
		  train_scores = model.evaluate(dataset, [pearson_r2])

		  assert train_scores[pearson_r2.name] > .9

		def test_siamese_singletask_classification_overfit(self):
		"""Test siamese singletask model overfits tiny data."""
		np.random.seed(123)

examples/delaney/delaney_textcnn.py

0 → 100644

+47 −0

Original line number	Diff line number	Diff line
		"""
		Script that trains textCNN models on delaney dataset.
		"""
		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		import numpy as np
		np.random.seed(123)
		import tensorflow as tf
		tf.set_random_seed(123)
		import deepchem as dc

		# Load the Delaney dataset with raw featurization and an index split
		delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
		    featurizer='Raw', split='index')
		train_dataset, valid_dataset, test_dataset = delaney_datasets

		# Metric reported for the fitted model
		metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

		# Character dictionary and padded sequence length come from the training set
		char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(train_dataset)

		# Batch size of models
		batch_size = 64

		model = dc.models.TextCNNTensorGraph(
		    len(delaney_tasks),
		    char_dict,
		    seq_length=length,
		    mode='regression',
		    learning_rate=1e-3,
		    batch_size=batch_size,
		    use_queue=False)

		# Train the model
		model.fit(train_dataset, nb_epoch=100)

		print("Evaluating model")
		train_scores = model.evaluate(train_dataset, [metric], transformers)
		valid_scores = model.evaluate(valid_dataset, [metric], transformers)

		# Report both score dictionaries, each under its own heading
		for heading, scores in (("Train scores", train_scores),
		                        ("Validation scores", valid_scores)):
		  print(heading)
		  print(scores)

Admin message