Merge remote-tracking branch 'remotes/origin/master' into temp3 (b258a39b) · Commits · 钟慕尧 / deepchem

deepchem/data/datasets.py

+7 −1

Original line number	Diff line number	Diff line
		@@ -14,6 +14,7 @@ from deepchem.utils.save import log
		import tempfile
		import time
		import shutil
		from multiprocessing.dummy import Pool

		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		@@ -643,8 +644,12 @@ class DiskDataset(Dataset):
		shard_perm = np.random.permutation(num_shards)
		else:
		shard_perm = np.arange(num_shards)
		pool = Pool(1)
		next_shard = pool.apply_async(dataset.get_shard, (shard_perm[0],))
		for i in range(num_shards):
		X, y, w, ids = dataset.get_shard(shard_perm[i])
		X, y, w, ids = next_shard.get()
		if i < num_shards - 1:
		next_shard = pool.apply_async(dataset.get_shard, (shard_perm[i + 1],))
		n_samples = X.shape[0]
		# TODO(rbharath): This happens in tests sometimes, but don't understand why?
		# Handle edge case.
		@@ -683,6 +688,7 @@ class DiskDataset(Dataset):
		(X_batch, y_batch, w_batch, ids_batch) = pad_batch(
		shard_batch_size, X_batch, y_batch, w_batch, ids_batch)
		yield (X_batch, y_batch, w_batch, ids_batch)
		pool.close()

		return iterate(self)

deepchem/models/init.py

+2 −0

Original line number	Diff line number	Diff line
		@@ -30,3 +30,5 @@ from deepchem.models.tensorflow_models.IRV import TensorflowMultiTaskIRVClassifi
		from deepchem.models.tensorgraph.tensor_graph import TensorGraph
		from deepchem.models.tensorgraph.models.graph_models import WeaveTensorGraph, DTNNTensorGraph, DAGTensorGraph, GraphConvTensorGraph, MPNNTensorGraph
		from deepchem.models.tensorgraph.models.symmetry_function_regression import BPSymmetryFunctionRegression, ANIRegression

		from deepchem.models.tensorgraph.models.seqtoseq import SeqToSeq

deepchem/models/tensorgraph/layers.py

+42 −1

Original line number	Diff line number	Diff line
		@@ -504,9 +504,28 @@ class Transpose(Layer):


		class CombineMeanStd(Layer):
		"""Generate Gaussian nose."""

		def __init__(self, in_layers=None, **kwargs):
		def __init__(self, in_layers=None, training_only=False, **kwargs):
		"""Create a CombineMeanStd layer.

		This layer should have two inputs with the same shape, and its output also has the
		same shape. Each element of the output is a Gaussian distributed random number
		whose mean is the corresponding element of the first input, and whose standard
		deviation is the corresponding element of the second input.

		Parameters
		----------
		in_layers: list
		the input layers. The first one specifies the mean, and the second one specifies
		the standard deviation.
		training_only: bool
		if True, noise is only generated during training. During prediction, the output
		is simply equal to the first input (that is, the mean of the distribution used
		during training).
		"""
		super(CombineMeanStd, self).__init__(in_layers, **kwargs)
		self.training_only = training_only
		try:
		self._shape = self.in_layers[0].shape
		except:
		@@ -519,6 +538,8 @@ class CombineMeanStd(Layer):
		mean_parent, std_parent = inputs[0], inputs[1]
		sample_noise = tf.random_normal(
		mean_parent.get_shape(), 0, 1, dtype=tf.float32)
		if self.training_only:
		sample_noise *= kwargs['training']
		out_tensor = mean_parent + (std_parent * sample_noise)
		if set_tensors:
		self.out_tensor = out_tensor
		@@ -996,6 +1017,26 @@ class Log(Layer):
		return out_tensor


		class Exp(Layer):
		"""Compute the exponential of the input."""

		def __init__(self, in_layers=None, **kwargs):
		super(Exp, self).__init__(in_layers, **kwargs)
		try:
		self._shape = self.in_layers[0].shape
		except:
		pass

		def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
		inputs = self._get_input_tensors(in_layers)
		if len(inputs) != 1:
		raise ValueError('Exp must have a single parent')
		out_tensor = tf.exp(inputs[0])
		if set_tensors:
		self.out_tensor = out_tensor
		return out_tensor


		class InteratomicL2Distances(Layer):
		"""Compute (squared) L2 Distances between atoms given neighbors."""

deepchem/models/tensorgraph/models/graph_models.py

+20 −0

Original line number	Diff line number	Diff line
		@@ -669,6 +669,26 @@ class GraphConvTensorGraph(TensorGraph):

		return mu[:max_index + 1], sigma[:max_index + 1]

		def bayesian_predict_on_batch(self, X, transformers=[], n_passes=4):
		"""
		Returns:
		mu: numpy ndarray of shape (n_samples, n_tasks)
		sigma: numpy ndarray of shape (n_samples, n_tasks)
		"""
		dataset = NumpyDataset(X=X, y=None, n_tasks=len(self.outputs))
		y_ = []
		for i in range(n_passes):
		generator = self.default_generator(
		dataset, predict=True, pad_batches=True)
		y_.append(self.predict_on_generator(generator, transformers))

		# Concatenates along 0-th dimension
		y_ = np.array(y_)
		mu = np.mean(y_, axis=0)
		sigma = np.std(y_, axis=0)

		return mu, sigma

		def predict_on_smiles(self, smiles, transformers=[], untransform=False):
		"""Generates predictions on a numpy array of smile strings

deepchem/models/tensorgraph/models/seqtoseq.py

0 → 100644

+401 −0

Original line number	Diff line number	Diff line
		"""Sequence to sequence translation models."""

		from deepchem.models import TensorGraph
		from deepchem.models.tensorgraph import layers
		from heapq import heappush, heappushpop
		import numpy as np
		import tensorflow as tf


		class SeqToSeq(TensorGraph):
		"""Implements sequence to sequence translation models.

		The model is based on the description in Sutskever et al., "Sequence to
		Sequence Learning with Neural Networks" (https://arxiv.org/abs/1409.3215),
		although this implementation uses GRUs instead of LSTMs. The goal is to
		take sequences of tokens as input, and translate each one into a different
		output sequence. The input and output sequences can both be of variable
		length, and an output sequence need not have the same length as the input
		sequence it was generated from. For example, these models were originally
		developed for use in natural language processing. In that context, the
		input might be a sequence of English words, and the output might be a
		sequence of French words. The goal would be to train the model to translate
		sentences from English to French.

		The model consists of two parts called the "encoder" and "decoder". Each one
		consists of a stack of recurrent layers. The job of the encoder is to
		transform the input sequence into a single, fixed length vector called the
		"embedding". That vector contains all relevant information from the input
		sequence. The decoder then transforms the embedding vector into the output
		sequence.

		These models can be used for various purposes. First and most obviously,
		they can be used for sequence to sequence translation. In any case where you
		have sequences of tokens, and you want to translate each one into a different
		sequence, a SeqToSeq model can be trained to perform the translation.

		Another possible use case is transforming variable length sequences into
		fixed length vectors. Many types of models require their inputs to have a
		fixed shape, which makes it difficult to use them with variable sized inputs
		(for example, when the input is a molecule, and different molecules have
		different numbers of atoms). In that case, you can train a SeqToSeq model as
		an autoencoder, so that it tries to make the output sequence identical to the
		input one. That forces the embedding vector to contain all information from
		the original sequence. You can then use the encoder for transforming
		sequences into fixed length embedding vectors, suitable to use as inputs to
		other types of models.

		Another use case is to train the decoder for use as a generative model. Here
		again you begin by training the SeqToSeq model as an autoencoder. Once
		training is complete, you can supply arbitrary embedding vectors, and
		transform each one into an output sequence. When used in this way, you
		typically train it as a variational autoencoder. This adds random noise to
		the encoder, and also adds a constraint term to the loss that forces the
		embedding vector to have a unit Gaussian distribution. You can then pick
		random vectors from a Gaussian distribution, and the output sequences should
		follow the same distribution as the training data.

		When training as a variational autoencoder, it is best to use KL cost
		annealing, as described in https://arxiv.org/abs/1511.06349. The constraint
		term in the loss is initially set to 0, so the optimizer just tries to
		minimize the reconstruction loss. Once it has made reasonable progress
		toward that, the constraint term can be gradually turned back on. The range
		of steps over which this happens is configurable.
		"""

		sequence_end = object()

		def __init__(self,
		input_tokens,
		output_tokens,
		max_output_length,
		encoder_layers=4,
		decoder_layers=4,
		embedding_dimension=512,
		dropout=0.0,
		reverse_input=True,
		variational=False,
		annealing_start_step=5000,
		annealing_final_step=10000,
		**kwargs):
		"""Construct a SeqToSeq model.

		In addition to the following arguments, this class also accepts all the keyword arguments
		from TensorGraph.

		Parameters
		----------
		input_tokens: list
		a list of all tokens that may appear in input sequences
		output_tokens: list
		a list of all tokens that may appear in output sequences
		max_output_length: int
		the maximum length of output sequence that may be generated
		encoder_layers: int
		the number of recurrent layers in the encoder
		decoder_layers: int
		the number of recurrent layers in the decoder
		embedding_dimension: int
		the width of the embedding vector. This also is the width of all
		recurrent layers.
		dropout: float
		the dropout probability to use during training
		reverse_input: bool
		if True, reverse the order of input sequences before sending them into
		the encoder. This can improve performance when working with long sequences.
		variational: bool
		if True, train the model as a variational autoencoder. This adds random
		noise to the encoder, and also constrains the embedding to follow a unit
		Gaussian distribution.
		annealing_start_step: int
		the step (that is, batch) at which to begin turning on the constraint term
		for KL cost annealing
		annealing_final_step: int
		the step (that is, batch) at which to finish turning on the constraint term
		for KL cost annealing
		"""
		super(SeqToSeq, self).__init__(
		use_queue=False, **kwargs) # TODO can we make it work with the queue?
		if SeqToSeq.sequence_end not in input_tokens:
		input_tokens = input_tokens + [SeqToSeq.sequence_end]
		if SeqToSeq.sequence_end not in output_tokens:
		output_tokens = output_tokens + [SeqToSeq.sequence_end]
		self._input_tokens = input_tokens
		self._output_tokens = output_tokens
		self._input_dict = dict((x, i) for i, x in enumerate(input_tokens))
		self._output_dict = dict((x, i) for i, x in enumerate(output_tokens))
		self._max_output_length = max_output_length
		self._embedding_dimension = embedding_dimension
		self._annealing_final_step = annealing_final_step
		self._annealing_start_step = annealing_start_step
		self._features = layers.Feature(shape=(None, None, len(input_tokens)))
		self._labels = layers.Label(shape=(None, None, len(output_tokens)))
		self._gather_indices = layers.Feature(
		shape=(self.batch_size, 2), dtype=tf.int32)
		self._reverse_input = reverse_input
		self._variational = variational
		self.embedding = self._create_encoder(encoder_layers, dropout)
		self.output = self._create_decoder(decoder_layers, dropout)
		self.set_loss(self._create_loss())
		self.add_output(self.output)

		def _create_encoder(self, n_layers, dropout):
		"""Create the encoder layers."""
		prev_layer = self._features
		for i in range(n_layers):
		if dropout > 0.0:
		prev_layer = layers.Dropout(dropout, in_layers=prev_layer)
		prev_layer = layers.GRU(
		self._embedding_dimension, self.batch_size, in_layers=prev_layer)
		prev_layer = layers.Gather(in_layers=[prev_layer, self._gather_indices])
		if self._variational:
		self._embedding_mean = layers.Dense(
		self._embedding_dimension, in_layers=prev_layer)
		self._embedding_stddev = layers.Dense(
		self._embedding_dimension, in_layers=prev_layer)
		prev_layer = layers.CombineMeanStd(
		[self._embedding_mean, self._embedding_stddev], training_only=True)
		return prev_layer

		def _create_decoder(self, n_layers, dropout):
		"""Create the decoder layers."""
		prev_layer = layers.Repeat(
		self._max_output_length, in_layers=self.embedding)
		for i in range(n_layers):
		if dropout > 0.0:
		prev_layer = layers.Dropout(dropout, in_layers=prev_layer)
		prev_layer = layers.GRU(
		self._embedding_dimension, self.batch_size, in_layers=prev_layer)
		return layers.Dense(
		len(self._output_tokens),
		in_layers=prev_layer,
		activation_fn=tf.nn.softmax)

		def _create_loss(self):
		"""Create the loss function."""
		prob = layers.ReduceSum(self.output * self._labels, axis=2)
		mask = layers.ReduceSum(self._labels, axis=2)
		log_prob = layers.Log(prob + 1e-20) * mask
		loss = -layers.ReduceMean(layers.ReduceSum(log_prob, axis=1))
		if self._variational:
		mean_sq = self._embedding_mean * self._embedding_mean
		stddev_sq = self._embedding_stddev * self._embedding_stddev
		kl = mean_sq + stddev_sq - layers.Log(stddev_sq) - 1
		anneal_steps = self._annealing_final_step - self._annealing_start_step
		if anneal_steps > 0:
		current_step = tf.to_float(
		self.get_global_step()) - self._annealing_start_step
		anneal_frac = tf.maximum(0.0, current_step) / anneal_steps
		kl_scale = layers.TensorWrapper(
		tf.minimum(1.0, anneal_frac * anneal_frac))
		else:
		kl_scale = 1.0
		loss += 0.5 * kl_scale * layers.ReduceMean(layers.ReduceSum(kl, axis=1))
		return loss

		def fit_sequences(self,
		sequences,
		max_checkpoints_to_keep=5,
		checkpoint_interval=1000,
		restore=False):
		"""Train this model on a set of sequences

		Parameters
		----------
		sequences: iterable
		the training samples to fit to. Each sample should be
		represented as a tuple of the form (input_sequence, output_sequence).
		max_checkpoints_to_keep: int
		the maximum number of checkpoints to keep. Older checkpoints are discarded.
		checkpoint_interval: int
		the frequency at which to write checkpoints, measured in training steps.
		restore: bool
		if True, restore the model from the most recent checkpoint and continue training
		from there. If False, retrain the model from scratch.
		"""
		self.fit_generator(
		self._generate_batches(sequences),
		max_checkpoints_to_keep=max_checkpoints_to_keep,
		checkpoint_interval=checkpoint_interval,
		restore=restore)

		def predict_from_sequences(self, sequences, beam_width=5):
		"""Given a set of input sequences, predict the output sequences.

		The prediction is done using a beam search with length normalization.

		Parameters
		----------
		sequences: iterable
		the input sequences to generate a prediction for
		beam_width: int
		the beam width to use for searching. Set to 1 to use a simple greedy search.
		"""
		result = []
		with self._get_tf("Graph").as_default():
		for batch in self._batch_elements(sequences):
		feed_dict = {}
		feed_dict[self._features] = self._create_input_array(batch)
		feed_dict[self._gather_indices] = [(i, len(batch[i])
		if i < len(batch) else 0)
		for i in range(self.batch_size)]
		feed_dict[self._training_placeholder] = 0.0
		for initial, zero in zip(self.rnn_initial_states, self.rnn_zero_states):
		feed_dict[initial] = zero
		probs = self.session.run(self.output, feed_dict=feed_dict)
		for i in range(len(batch)):
		result.append(self._beam_search(probs[i], beam_width))
		return result

		def predict_from_embeddings(self, embeddings, beam_width=5):
		"""Given a set of embedding vectors, predict the output sequences.

		The prediction is done using a beam search with length normalization.

		Parameters
		----------
		embeddings: iterable
		the embedding vectors to generate predictions for
		beam_width: int
		the beam width to use for searching. Set to 1 to use a simple greedy search.
		"""
		result = []
		with self._get_tf("Graph").as_default():
		for batch in self._batch_elements(embeddings):
		embedding_array = np.zeros(
		(self.batch_size, self._embedding_dimension), dtype=np.float32)
		for i, e in enumerate(batch):
		embedding_array[i] = e
		feed_dict = {}
		feed_dict[self.embedding] = embedding_array
		feed_dict[self._training_placeholder] = 0.0
		for initial, zero in zip(self.rnn_initial_states, self.rnn_zero_states):
		feed_dict[initial] = zero
		probs = self.session.run(self.output, feed_dict=feed_dict)
		for i in range(len(batch)):
		result.append(self._beam_search(probs[i], beam_width))
		return result

		def predict_embeddings(self, sequences):
		"""Given a set of input sequences, compute the embedding vectors.

		Parameters
		----------
		sequences: iterable
		the input sequences to generate an embedding vector for
		"""
		result = []
		with self._get_tf("Graph").as_default():
		for batch in self._batch_elements(sequences):
		feed_dict = {}
		feed_dict[self._features] = self._create_input_array(batch)
		feed_dict[self._gather_indices] = [(i, len(batch[i])
		if i < len(batch) else 0)
		for i in range(self.batch_size)]
		feed_dict[self._training_placeholder] = 0.0
		for initial, zero in zip(self.rnn_initial_states, self.rnn_zero_states):
		feed_dict[initial] = zero
		embeddings = self.session.run(self.embedding, feed_dict=feed_dict)
		for i in range(len(batch)):
		result.append(embeddings[i])
		return np.array(result, dtype=np.float32)

		def _beam_search(self, probs, beam_width):
		"""Perform a beam search for the most likely output sequence."""
		if beam_width == 1:
		# Do a simple greedy search.

		s = []
		for i in range(len(probs)):
		token = self._output_tokens[np.argmax(probs[i])]
		if token == SeqToSeq.sequence_end:
		break
		s.append(token)
		return s

		# Do a beam search with length normalization.

		logprobs = np.log(probs)
		# Represent each candidate as (normalized prob, raw prob, sequence)
		candidates = [(0.0, 0.0, [])]
		for i in range(len(logprobs)):
		new_candidates = []
		for c in candidates:
		if len(c[2]) > 0 and c[2][-1] == SeqToSeq.sequence_end:
		# This candidate sequence has already been terminated
		if len(new_candidates) < beam_width:
		heappush(new_candidates, c)
		else:
		heappushpop(new_candidates, c)
		else:
		# Consider all possible tokens we could add to this candidate sequence.
		for j, logprob in enumerate(logprobs[i]):
		new_logprob = logprob + c[1]
		newc = (new_logprob / (len(c[2]) + 1), new_logprob,
		c[2] + [self._output_tokens[j]])
		if len(new_candidates) < beam_width:
		heappush(new_candidates, newc)
		else:
		heappushpop(new_candidates, newc)
		candidates = new_candidates
		return sorted(candidates)[-1][2][:-1]

		def _create_input_array(self, sequences):
		"""Create the array describing the input sequences for a batch."""
		lengths = [len(x) for x in sequences]
		if self._reverse_input:
		sequences = [reversed(s) for s in sequences]
		features = np.zeros(
		(self.batch_size, max(lengths) + 1, len(self._input_tokens)),
		dtype=np.float32)
		for i, sequence in enumerate(sequences):
		for j, token in enumerate(sequence):
		features[i, j, self._input_dict[token]] = 1
		features[np.arange(len(sequences)), lengths, self._input_dict[
		SeqToSeq.sequence_end]] = 1
		return features

		def _create_output_array(self, sequences):
		"""Create the array describing the target sequences for a batch."""
		lengths = [len(x) for x in sequences]
		labels = np.zeros(
		(self.batch_size, self._max_output_length, len(self._output_tokens)),
		dtype=np.float32)
		end_marker_index = self._output_dict[SeqToSeq.sequence_end]
		for i, sequence in enumerate(sequences):
		for j, token in enumerate(sequence):
		labels[i, j, self._output_dict[token]] = 1
		if lengths[i] < self._max_output_length:
		labels[i, lengths[i], end_marker_index] = 1
		return labels

		def _batch_elements(self, elements):
		"""Combine elements into batches."""
		batch = []
		for s in elements:
		batch.append(s)
		if len(batch) == self.batch_size:
		yield batch
		batch = []
		if len(batch) > 0:
		yield batch

		def _generate_batches(self, sequences):
		"""Create feed_dicts for fitting."""
		for batch in self._batch_elements(sequences):
		inputs = []
		outputs = []
		for input, output in batch:
		inputs.append(input)
		outputs.append(output)
		for i in range(len(inputs), self.batch_size):
		inputs.append([])
		outputs.append([])
		feed_dict = {}
		feed_dict[self._features] = self._create_input_array(inputs)
		feed_dict[self._labels] = self._create_output_array(outputs)
		feed_dict[self._gather_indices] = [(i, len(x))
		for i, x in enumerate(inputs)]
		for initial, zero in zip(self.rnn_initial_states, self.rnn_zero_states):
		feed_dict[initial] = zero
		yield feed_dict

Admin message