Merge pull request #1106 from peastman/estimator (212b325d) · Commits · 钟慕尧 / deepchem

deepchem/models/tensorgraph/fcnet.py

+17 −23

Original line number	Diff line number	Diff line
		@@ -18,7 +18,7 @@ from deepchem.metrics import to_one_hot, from_one_hot
		from deepchem.metrics import to_one_hot

		from deepchem.models.tensorgraph.tensor_graph import TensorGraph, TFWrapper
		from deepchem.models.tensorgraph.layers import Feature, Label, Weights, WeightedError, Dense, Dropout, WeightDecay, Reshape, SoftMaxCrossEntropy, L2Loss, ReduceSum
		from deepchem.models.tensorgraph.layers import Feature, Label, Weights, WeightedError, Dense, Dropout, WeightDecay, Reshape, SoftMax, SoftMaxCrossEntropy, L2Loss, ReduceSum

		logger = logging.getLogger(__name__)

		@@ -114,15 +114,16 @@ class MultiTaskClassifier(TensorGraph):

		# Compute the loss function for each label.

		output = Reshape(
		logits = Reshape(
		shape=(-1, n_tasks, n_classes),
		in_layers=[
		Dense(in_layers=[prev_layer], out_channels=n_tasks * n_classes)
		])
		output = SoftMax(logits)
		self.add_output(output)
		labels = Label(shape=(None, n_tasks, n_classes))
		weights = Weights(shape=(None, n_tasks))
		loss = SoftMaxCrossEntropy(in_layers=[labels, output])
		loss = SoftMaxCrossEntropy(in_layers=[labels, logits])
		weighted_loss = WeightedError(in_layers=[loss, weights])
		if weight_decay_penalty != 0.0:
		weighted_loss = WeightDecay(
		@@ -154,6 +155,19 @@ class MultiTaskClassifier(TensorGraph):
		feed_dict[self.task_weights[0]] = w_b
		yield feed_dict

		def create_estimator_inputs(self, feature_columns, weight_column, features,
		labels, mode):
		"""Create the input tensors for an Estimator built from this classifier.

		Parameters
		----------
		feature_columns: list of tf.feature_column objects
		paired one-to-one (via zip) with the layers in self.features.
		weight_column: tf.feature_column or None
		if not None, it is used to build the tensor for self.task_weights[0].
		features:
		the features argument received by the Estimator's model function; it is
		passed straight to tf.feature_column.input_layer().
		labels: tensor or None
		integer class labels. If not None they are cast to int32 and one-hot
		encoded with depth self.n_classes.
		mode:
		not read by this implementation.

		Returns
		-------
		a dict whose keys are input layers of this model and whose values are the
		tensors created for them.
		"""
		tensors = {}
		# One dense input tensor per feature layer.
		for layer, column in zip(self.features, feature_columns):
		tensors[layer] = tf.feature_column.input_layer(features, [column])
		if weight_column is not None:
		tensors[self.task_weights[0]] = tf.feature_column.input_layer(
		features, [weight_column])
		if labels is not None:
		# Convert class indices to a one-hot encoding over self.n_classes classes.
		tensors[self.labels[0]] = tf.one_hot(
		tf.cast(labels, tf.int32), self.n_classes)
		return tensors

		def predict_proba(self, dataset, transformers=None, outputs=None):
		  """Return predictions by delegating to the parent class's predict().

		  Parameters
		  ----------
		  dataset:
		    forwarded unchanged to predict().
		  transformers: list or None
		    forwarded to predict(). None (the default) is replaced by a new empty
		    list, so no list object is shared between calls.
		  outputs:
		    forwarded unchanged to predict().

		  Returns
		  -------
		  whatever the parent class's predict() returns for these arguments.
		  """
		  # A literal [] default would be a single list shared by every call.
		  if transformers is None:
		    transformers = []
		  return super(MultiTaskClassifier, self).predict(dataset, transformers,
		                                                  outputs)
		@@ -291,26 +305,6 @@ class MultiTaskRegressor(TensorGraph):
		in_layers=[weighted_loss])
		self.set_loss(weighted_loss)

		def default_generator(self,
		dataset,
		epochs=1,
		predict=False,
		deterministic=True,
		pad_batches=True):
		"""Yield one feed dict per batch of dataset, repeated for each epoch.

		Parameters
		----------
		dataset:
		object providing iterbatches(batch_size, deterministic, pad_batches) that
		yields (X_b, y_b, w_b, ids_b) tuples.
		epochs: int
		number of passes to make over the dataset.
		predict: bool
		if True, labels and weights are left out of the feed dicts.
		deterministic: bool
		forwarded to dataset.iterbatches().
		pad_batches: bool
		forwarded to dataset.iterbatches().

		Yields
		------
		a dict mapping self.labels[0], self.features[0] and self.task_weights[0]
		to the batch arrays that are present and applicable.
		"""
		for epoch in range(epochs):
		for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
		batch_size=self.batch_size,
		deterministic=deterministic,
		pad_batches=pad_batches):
		feed_dict = dict()
		if y_b is not None and not predict:
		# Append a trailing axis of size 1: labels are fed as (-1, n_tasks, 1).
		feed_dict[self.labels[0]] = y_b.reshape(-1, self.n_tasks, 1)
		if X_b is not None:
		feed_dict[self.features[0]] = X_b
		if w_b is not None and not predict:
		feed_dict[self.task_weights[0]] = w_b
		yield feed_dict


		class MultiTaskFitTransformRegressor(MultiTaskRegressor):
		"""Implements a MultiTaskRegressor that performs on-the-fly transformation during fit/predict.

deepchem/models/tensorgraph/layers.py

+16 −10

Original line number	Diff line number	Diff line
		@@ -152,9 +152,8 @@ class Layer(object):

		def set_summary(self, summary_op, summary_description=None, collections=None):
		"""Annotates a tensor with a tf.summary operation
		Collects data from self.out_tensor by default but can be changed by setting
		self.tb_input to another tensor in create_tensor

		This causes self.out_tensor to be logged to Tensorboard.

		Parameters
		----------
		@@ -175,21 +174,28 @@ class Layer(object):
		self.collections = collections
		self.tensorboard = True

		def add_summary_to_tg(self):
		def add_summary_to_tg(self, tb_input=None):
		"""
		Can only be called after self.create_layer to gaurentee that name is not none
		Create the summary operation for this layer, if set_summary() has been called on it.

		Can only be called after self.create_layer to guarantee that name is not None.

		Parameters
		----------
		tb_input: tensor
		the tensor to log to Tensorboard. If None, self.out_tensor is used.
		"""
		if self.tensorboard == False:
		return
		if self.tb_input == None:
		self.tb_input = self.out_tensor
		if tb_input == None:
		tb_input = self.out_tensor
		if self.summary_op == "tensor_summary":
		tf.summary.tensor_summary(self.name, self.tb_input,
		self.summary_description, self.collections)
		tf.summary.tensor_summary(self.name, tb_input, self.summary_description,
		self.collections)
		elif self.summary_op == 'scalar':
		tf.summary.scalar(self.name, self.tb_input, self.collections)
		tf.summary.scalar(self.name, tb_input, self.collections)
		elif self.summary_op == 'histogram':
		tf.summary.histogram(self.name, self.tb_input, self.collections)
		tf.summary.histogram(self.name, tb_input, self.collections)

		def copy(self, replacements={}, variables_graph=None, shared=False):
		"""Duplicate this Layer and all its inputs.

deepchem/models/tensorgraph/tensor_graph.py

+150 −0

Original line number	Diff line number	Diff line
		@@ -842,6 +842,146 @@ class TensorGraph(Model):
		feed_dict[self._training_placeholder] = train_value
		yield feed_dict

		def make_estimator(self,
		                   feature_columns,
		                   weight_column=None,
		                   model_dir=None,
		                   metrics=None):
		  """Construct a Tensorflow Estimator from this model.

		  tf.estimator.Estimator is the standard Tensorflow API for representing models.
		  This method provides interoperability between DeepChem and other Tensorflow
		  based tools by allowing any model to be used as an Estimator.

		  Once this method returns, the Estimator it created is independent of the model
		  it was created from. They do not share tensors, variables, save files, or any
		  other resources. The Estimator is a self contained object with its own methods
		  for training, evaluation, prediction, checkpointing, etc.

		  Parameters
		  ----------
		  feature_columns: list of tf.feature_column objects
		    this describes the input features to the models. There must be one entry
		    for each Feature layer in this model's features field.
		  weight_column: tf.feature_column or None
		    if this model includes a Weights layer, this describes the input weights.
		    Otherwise, this should be None.
		  model_dir: str
		    the directory in which the Estimator should save files. If None, this
		    defaults to the model's model_dir.
		  metrics: map or None
		    metrics that should be computed in calls to evaluate(). For each entry,
		    the key is the name to report for the metric, and the value is a function
		    of the form f(labels, predictions, weights) that returns the tensors for
		    computing the metric. Any of the functions in tf.metrics can be used, as
		    can other functions that satisfy the same interface. None (the default)
		    means no metrics.

		  Returns
		  -------
		  a tf.estimator.Estimator built from this model's layers.

		  Raises
		  ------
		  ValueError
		    if the number of feature columns does not match the number of Feature
		    layers, if the model does not have exactly one Label layer, if it has
		    more than one Weights layer, or if weight_column is given/omitted
		    inconsistently with the presence of a Weights layer.
		  """
		  # Use None as the default instead of a shared mutable dict.
		  if metrics is None:
		    metrics = {}

		  # Check the inputs.

		  if len(feature_columns) != len(self.features):
		    raise ValueError(
		        'This model requires %d feature column(s)' % len(self.features))
		  if len(self.labels) != 1:
		    raise ValueError(
		        'Can only create an Estimator from a model with exactly one Label input'
		    )
		  if len(self.task_weights) > 1:
		    raise ValueError(
		        'Cannot create an Estimator from a model with multiple Weight inputs')
		  if weight_column is None:
		    if len(self.task_weights) > 0:
		      raise ValueError('This model requires a weight column')
		  else:
		    if len(self.task_weights) == 0:
		      raise ValueError(
		          'Cannot specify weight_column for a model with no Weight inputs')
		  if model_dir is None:
		    model_dir = self.model_dir

		  # Define a function that recursively creates tensors from layers.

		  def create_tensors(layer, tensors, training):
		    # tensors doubles as a memo table, so each layer is built only once.
		    if layer in tensors:
		      return tensors[layer]
		    inputs = [
		        create_tensors(in_layer, tensors, training)
		        for in_layer in layer.in_layers
		    ]
		    # set_tensors=False is passed so the tensor is returned to us; it is
		    # tracked in the local dict rather than read back from the layer.
		    tensor = layer.create_tensor(
		        in_layers=inputs, set_tensors=False, training=training)
		    tensors[layer] = tensor
		    layer.add_summary_to_tg(tensor)
		    return tensor

		  # Define the model function.

		  def model_fn(features, labels, mode):
		    # Define the inputs.

		    tensors = self.create_estimator_inputs(feature_columns, weight_column,
		                                           features, labels, mode)
		    for layer, tensor in tensors.items():
		      layer.add_summary_to_tg(tensor)

		    # Create the correct outputs, based on the mode.

		    if mode == tf.estimator.ModeKeys.PREDICT:
		      # Every output is reported, keyed by its index in self.outputs.
		      predictions = {}
		      for i, output in enumerate(self.outputs):
		        predictions[i] = create_tensors(output, tensors, 0)
		      return tf.estimator.EstimatorSpec(mode, predictions=predictions)
		    if mode == tf.estimator.ModeKeys.EVAL:
		      loss = create_tensors(self.loss, tensors, 0)
		      # Metrics are computed against the first output only.
		      predictions = create_tensors(self.outputs[0], tensors, 0)
		      if len(self.task_weights) == 0:
		        weights = None
		      else:
		        weights = tensors[self.task_weights[0]]
		      eval_metric_ops = {}
		      for name, function in metrics.items():
		        eval_metric_ops[name] = function(tensors[self.labels[0]], predictions,
		                                         weights)
		      return tf.estimator.EstimatorSpec(
		          mode, loss=loss, eval_metric_ops=eval_metric_ops)
		    if mode == tf.estimator.ModeKeys.TRAIN:
		      loss = create_tensors(self.loss, tensors, 1)
		      global_step = tf.train.get_global_step()
		      optimizer = self.optimizer._create_optimizer(global_step)
		      train_op = optimizer.minimize(loss, global_step=global_step)
		      return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
		    raise ValueError('Unknown mode')

		  # Create the Estimator.

		  return tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)

		def create_estimator_inputs(self, feature_columns, weight_column, features,
		labels, mode):
		"""This is called by make_estimator() to create tensors for the inputs.

		feature_columns and weight_column are the arguments passed to
		make_estimator(). features, labels, and mode are the arguments passed to
		the estimator's model function. This method creates and returns a dict with
		one entry for every Feature, Label, or Weights layer in the graph. The keys
		are the layers, and the values are the tensors that correspond to them.

		Any subclass that overrides default_generator() must also override this
		method.

		Parameters
		----------
		feature_columns: list of tf.feature_column objects
		paired one-to-one (via zip) with the layers in self.features.
		weight_column: tf.feature_column or None
		if not None, it is used to build the tensor for self.task_weights[0].
		features:
		passed straight to tf.feature_column.input_layer().
		labels: tensor or None
		if not None, cast to the dtype of self.labels[0].
		mode:
		not read by this default implementation.

		Returns
		-------
		a dict mapping input layers to the tensors created for them.

		Raises
		------
		ValueError
		if the class of self defines its own default_generator() and this default
		implementation is still the one being called.
		"""
		# A customised default_generator() means the default input mapping below
		# cannot be trusted, so fail loudly rather than feed mismatched inputs.
		if self.__class__.default_generator != TensorGraph.default_generator:
		raise ValueError(
		"Class overrides default_generator() but not create_estimator_inputs()"
		)
		tensors = {}
		# One dense input tensor per feature layer.
		for layer, column in zip(self.features, feature_columns):
		tensors[layer] = tf.feature_column.input_layer(features, [column])
		if weight_column is not None:
		tensors[self.task_weights[0]] = tf.feature_column.input_layer(
		features, [weight_column])
		if labels is not None:
		tensors[self.labels[0]] = tf.cast(labels, self.labels[0].dtype)
		return tensors


		def _enqueue_batch(tg, generator, graph, sess, n_enqueued, final_sample):
		"""
		@@ -865,6 +1005,16 @@ def _enqueue_batch(tg, generator, graph, sess, n_enqueued, final_sample):
		for layer in tg.features + tg.labels + tg.task_weights:
		if layer in feed_dict:
		value = feed_dict[layer]
		# Add or remove dimensions of size 1 to match the shape of the layer.
		value_dims = len(value.shape)
		layer_dims = len(layer.shape)
		if value_dims < layer_dims:
		if all(i == 1 for i in layer.shape[value_dims:]):
		value = value.reshape(
		list(value.shape) + [1] * (layer_dims - value_dims))
		if value_dims > layer_dims:
		if all(i == 1 for i in value.shape[layer_dims:]):
		value = value.reshape(value.shape[:layer_dims])
		else:
		value = np.zeros(
		[0] + list(layer.shape[1:]), dtype=layer.dtype.as_numpy_dtype)

deepchem/models/tensorgraph/tests/test_estimators.py

0 → 100644

+95 −0

Original line number	Diff line number	Diff line
		import unittest
		import numpy as np
		import tensorflow as tf
		import deepchem as dc
		from deepchem.data import NumpyDataset


		class TestEstimators(unittest.TestCase):
		"""
		Test converting TensorGraphs to Estimators.

		Each test builds a small random dataset, converts a DeepChem model into a
		tf.estimator.Estimator with make_estimator(), trains it through an input
		function, and checks the loss and one metric returned by evaluate().
		"""

		def test_multi_task_classifier(self):
		"""Test creating an Estimator from a MultiTaskClassifier."""
		n_samples = 10
		n_features = 3
		n_tasks = 2

		# Create a dataset and an input function for processing it.

		# Fixed seed so the random features are reproducible.
		np.random.seed(123)
		X = np.random.rand(n_samples, n_features)
		# Every label is zero, so the target is the same for all samples and tasks.
		y = np.zeros((n_samples, n_tasks))
		dataset = dc.data.NumpyDataset(X, y)

		def input_fn(epochs):
		# The whole dataset is served as a single batch (batch_size=n_samples).
		x, y, weights = dataset.make_iterator(
		batch_size=n_samples, epochs=epochs).get_next()
		# Dict keys must match the names given to the feature columns below.
		return {'x': x, 'weights': weights}, y

		# Create a TensorGraph model.

		model = dc.models.MultiTaskClassifier(n_tasks, n_features, dropouts=0)

		# Create an estimator from it.

		x_col = tf.feature_column.numeric_column('x', shape=(n_features,))
		weight_col = tf.feature_column.numeric_column('weights', shape=(n_tasks,))

		def accuracy(labels, predictions, weights):
		# Predictions are rounded before being compared with the labels.
		return tf.metrics.accuracy(labels, tf.round(predictions), weights)

		metrics = {'accuracy': accuracy}
		estimator = model.make_estimator(
		feature_columns=[x_col], weight_column=weight_col, metrics=metrics)

		# Train the model.

		estimator.train(input_fn=lambda: input_fn(100))

		# Evaluate the model.

		results = estimator.evaluate(input_fn=lambda: input_fn(1))
		assert results['loss'] < 1e-4
		assert results['accuracy'] > 0.9

		def test_multi_task_regressor(self):
		"""Test creating an Estimator from a MultiTaskRegressor."""
		n_samples = 10
		n_features = 3
		n_tasks = 2

		# Create a dataset and an input function for processing it.

		# Fixed seed so the random features are reproducible.
		np.random.seed(123)
		X = np.random.rand(n_samples, n_features)
		# Every regression target is zero.
		y = np.zeros((n_samples, n_tasks))
		dataset = dc.data.NumpyDataset(X, y)

		def input_fn(epochs):
		# The whole dataset is served as a single batch (batch_size=n_samples).
		x, y, weights = dataset.make_iterator(
		batch_size=n_samples, epochs=epochs).get_next()
		# Dict keys must match the names given to the feature columns below.
		return {'x': x, 'weights': weights}, y

		# Create a TensorGraph model.

		model = dc.models.MultiTaskRegressor(n_tasks, n_features, dropouts=0)

		# Create an estimator from it.

		x_col = tf.feature_column.numeric_column('x', shape=(n_features,))
		weight_col = tf.feature_column.numeric_column('weights', shape=(n_tasks,))
		# tf.metrics functions can be used directly as metric callables here.
		metrics = {'error': tf.metrics.mean_absolute_error}
		estimator = model.make_estimator(
		feature_columns=[x_col], weight_column=weight_col, metrics=metrics)

		# Train the model.

		estimator.train(input_fn=lambda: input_fn(100))

		# Evaluate the model.

		results = estimator.evaluate(input_fn=lambda: input_fn(1))
		assert results['loss'] < 1e-3
		assert results['error'] < 0.1

Admin message