Cleanup (557bfba0) · Commits · 钟慕尧 / deepchem

deepchem/models/__init__.py

+0 −5

Original line number	Diff line number	Diff line
		@@ -8,12 +8,7 @@ from __future__ import unicode_literals
		from deepchem.models.models import Model
		from deepchem.models.sklearn_models import SklearnModel
		from deepchem.models.xgboost_models import XGBoostModel
		from deepchem.models.tf_new_models.multitask_classifier import MultitaskGraphClassifier
		from deepchem.models.tf_new_models.multitask_regressor import MultitaskGraphRegressor, DTNNMultitaskGraphRegressor

		from deepchem.models.tf_new_models.support_classifier import SupportGraphClassifier
		from deepchem.models.multitask import SingletaskToMultitask
		from deepchem.models.sequential import Sequential

		from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
		from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier

deepchem/models/sequential.py

+0 −321

Original line number	Diff line number	Diff line
		@@ -18,324 +18,3 @@ import tempfile
		import numpy as np
		import tensorflow as tf
		from deepchem.models.models import Model
		from deepchem.nn import model_ops
		from deepchem.nn.copy import Layer
		from deepchem.nn.copy import InputLayer


		class Sequential(Model):
		"""Linear stack of layers.

		Parameters
		----------
		layers: list of layers to add to the model.

		Note
		----
		The first layer passed to a Sequential model
		should have a defined input shape. What that
		means is that it should have received an `input_shape`
		or `batch_input_shape` argument,
		or for some type of layers (recurrent, Dense...)
		an `input_dim` argument.

		Example
		-------
		>>> import deepchem as dc
		>>> model = dc.models.Sequential()
		>>> # Add features
		>>> model.add_features(dc.nn.Input(shape=(50,)))
		>>> # Add labels
		>>> model.add_labels(dc.nn.Input(shape=(1,)))
		>>> model.add(dc.nn.Dense(32, 50))
		>>> model.add(dc.nn.Dense(64, 32))
		"""

		def __init__(self, name=None, logdir=None):
		    """Set up an empty layer stack with its own TensorFlow graph and session.

		    Parameters
		    ----------
		    name: str, optional
		      Model name. When falsy, a name of the form 'sequential_<uid>' is
		      generated from `model_ops.get_uid`.
		    logdir: str, optional
		      Forwarded to the base class as `model_dir`. Checkpoints are written
		      to 'model.ckpt' inside `self.model_dir`.
		    """
		    # super() already binds the instance; passing `self` again would hand
		    # the model to the base class as its first positional argument.
		    super(Sequential, self).__init__(model_dir=logdir)
		    self.layers = []  # stack of layers
		    self.outputs = None  # tensors (length 1)

		    if not name:
		        prefix = 'sequential_'
		        name = prefix + str(model_ops.get_uid(prefix))
		    self.name = name
		    self.graph = tf.Graph()

		    config = tf.ConfigProto(allow_soft_placement=True)
		    self.session = tf.Session(graph=self.graph, config=config)
		    # Path to save checkpoint files
		    self._save_path = os.path.join(self.model_dir, 'model.ckpt')

		def add(self, layer):
		    """Stack `layer` on top of the current output of the model.

		    Parameters
		    ----------
		    layer: Layer
		      Layer instance to apply to `self.outputs`.

		    Raises
		    ------
		    TypeError
		      If `layer` is not an instance of `Layer`.
		    ValueError
		      If no layer has been registered yet (see `add_features`).
		    """
		    is_layer = isinstance(layer, Layer)
		    if not is_layer:
		        message = ("The added layer must be an instance of class Layer. "
		                   "Found: " + str(layer))
		        raise TypeError(message)
		    with self.graph.as_default():
		        # An input layer has to be in place before anything is stacked.
		        if not self.layers:
		            raise ValueError("Call add_features() before calling add()")
		        previous_outputs = self.outputs
		        self.outputs = layer(previous_outputs)

		    self.layers.append(layer)

		def add_features(self, layer):
		    """Register the input layer that feeds features into the model.

		    The first element returned by calling `layer` becomes both
		    `self.features` and the initial `self.outputs`.

		    Raises
		    ------
		    ValueError
		      If layers were already added, or if `layer` is not an `InputLayer`.
		    """
		    already_populated = bool(self.layers)
		    if already_populated:
		        raise ValueError(
		            "add_features() has to be called before layers are added.")
		    if not isinstance(layer, InputLayer):
		        raise ValueError("First layer in sequential model must be InputLayer")
		    with self.graph.as_default():
		        feature_tensor = layer()[0]
		        self.features = feature_tensor
		        self.outputs = feature_tensor
		        self.layers = [layer]

		def add_labels(self, layer):
		    """Call `layer` inside the model graph and keep its first output as labels."""
		    with self.graph.as_default():
		        label_tensors = layer()
		        self.labels = label_tensors[0]

		def add_loss(self, loss, inputs=None):
		    """Attach a scalar training loss to the model graph.

		    Parameters
		    ----------
		    loss: callable
		      Called as `loss(self.outputs, self.labels)`; its result is summed
		      with `tf.reduce_sum` and stored in `self.loss`.
		    inputs: optional
		      Accepted but not used by this method.
		    """
		    with self.graph.as_default():
		        # Per-element losses are collapsed into one value.
		        per_element_loss = loss(self.outputs, self.labels)
		        self.loss = tf.reduce_sum(per_element_loss)

		@property
		def uses_learning_phase(self):
		    """Whether any layer in the stack reports using the learning phase.

		    A layer counts when it exposes a truthy `uses_learning_phase`
		    attribute; layers without the attribute are treated as not using it.
		    Returning `self.uses_learning_phase` here would re-enter this property
		    and recurse until RecursionError.
		    """
		    return any(
		        getattr(layer, 'uses_learning_phase', False) for layer in self.layers)

		def fit(self,
		        dataset,
		        nb_epoch=10,
		        max_checkpoints_to_keep=5,
		        log_every_N_batches=50,
		        learning_rate=.001,
		        batch_size=50,
		        checkpoint_interval=10):
		    """Trains the model for a fixed number of epochs.

		    TODO(rbharath): This is mostly copied from TensorflowGraphModel. Should
		    eventually refactor both together.

		    Parameters
		    ----------
		    dataset: dc.data.Dataset
		      Dataset object holding training data. Only its
		      `iterbatches(batch_size)` method is used here.
		    nb_epoch: int
		      Number of training epochs.
		    max_checkpoints_to_keep: int
		      Passed to `tf.train.Saver` as `max_to_keep`.
		    log_every_N_batches: int
		      A progress line is printed every this many batches.
		    learning_rate: float
		      Learning rate handed to `model_ops.optimizer("adam", ...)`.
		    batch_size: int
		      Number of samples per gradient update.
		    checkpoint_interval: int
		      Frequency at which to write checkpoints, measured in epochs.
		    """
		    ############################################################## TIMING
		    start_time = time.time()
		    ############################################################## TIMING
		    print("Training for %d epochs" % nb_epoch)
		    with self.graph.as_default():
		        opt = model_ops.optimizer("adam", learning_rate)
		        train_op = opt.minimize(self.loss, name='train')
		        # NOTE(review): using the session as a context manager releases it
		        # when training ends - confirm nothing needs it afterwards.
		        with self.session as sess:
		            sess.run(tf.global_variables_initializer())
		            saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
		            # Save an initial checkpoint.
		            saver.save(sess, self._save_path, global_step=0)
		            # Tracked separately so the final checkpoint still has a step
		            # number when the epoch loop never runs (nb_epoch <= 0).
		            epochs_done = 0
		            for epoch in range(nb_epoch):
		                total_loss, n_batches = 0., 0
		                # TODO(rbharath): Don't support example weighting yet.
		                for ind, (X_b, y_b, _, _) in enumerate(
		                        dataset.iterbatches(batch_size)):
		                    if ind % log_every_N_batches == 0:
		                        print("On batch %d" % ind)
		                    feed_dict = {self.features: X_b, self.labels: y_b}
		                    _, loss = sess.run(
		                        [train_op, self.loss], feed_dict=feed_dict)
		                    total_loss += loss
		                    n_batches += 1
		                if epoch % checkpoint_interval == checkpoint_interval - 1:
		                    saver.save(sess, self._save_path, global_step=epoch)
		                # An empty dataset yields no batches; report NaN rather than
		                # dividing by zero.
		                if n_batches:
		                    avg_loss = float(total_loss) / n_batches
		                else:
		                    avg_loss = float('nan')
		                print('Ending epoch %d: Average loss %g' % (epoch, avg_loss))
		                epochs_done = epoch + 1
		            # Always save a final checkpoint when complete.
		            saver.save(sess, self._save_path, global_step=epochs_done)
		    ############################################################## TIMING
		    end_time = time.time()
		    print("TIMING: model fitting took %0.3f s" % (end_time - start_time))
		    ############################################################## TIMING

		def evaluate(self,
		             x,
		             y,
		             batch_size=32,
		             verbose=1,
		             sample_weight=None,
		             **kwargs):
		    """Delegate loss evaluation to `self.model.evaluate`.

		    Parameters
		    ----------
		    x: input data, forwarded unchanged.
		    y: labels, forwarded unchanged.
		    batch_size: integer, forwarded as `batch_size`.
		    verbose: verbosity mode, forwarded as `verbose`.
		    sample_weight: sample weights, forwarded as `sample_weight`.
		    kwargs: only the deprecated `show_accuracy` key is tolerated; it is
		      dropped with a warning.

		    Returns
		    -------
		    Whatever `self.model.evaluate` returns.

		    Raises
		    ------
		    RuntimeError
		      If `self.model` is None.
		    TypeError
		      If any keyword other than `show_accuracy` is supplied.
		    """
		    if self.model is None:
		        raise RuntimeError('The model needs to be compiled before being used.')
		    unknown = {
		        key: value
		        for key, value in kwargs.items() if key != 'show_accuracy'
		    }
		    if len(unknown) != len(kwargs):
		        warnings.warn('The "show_accuracy" argument is deprecated, '
		                      'instead you should pass the "accuracy" metric to '
		                      'the model at compile time:\n'
		                      '`model.compile(optimizer, loss, '
		                      'metrics=["accuracy"])`')
		    if unknown:
		        raise TypeError('Received unknown keyword arguments: ' + str(unknown))
		    forwarded = dict(
		        batch_size=batch_size, verbose=verbose, sample_weight=sample_weight)
		    return self.model.evaluate(x, y, **forwarded)

		def predict(self, x, batch_size=32, verbose=0):
		    """Delegate prediction to `self.model.predict`.

		    `self.build()` is called first when `self.model` is None.

		    Parameters
		    ----------
		    x: the input data, forwarded unchanged.
		    batch_size: integer, forwarded as `batch_size`.
		    verbose: verbosity mode, forwarded as `verbose`.

		    Returns
		    -------
		    Whatever `self.model.predict` returns.
		    """
		    needs_build = self.model is None
		    if needs_build:
		        self.build()
		    backend = self.model
		    return backend.predict(x, batch_size=batch_size, verbose=verbose)

		def predict_on_batch(self, x):
		    """Delegate to `self.model.predict_on_batch`, building the model first
		    when `self.model` is None.
		    """
		    needs_build = self.model is None
		    if needs_build:
		        self.build()
		    backend = self.model
		    return backend.predict_on_batch(x)

		def train_on_batch(self,
		                   x,
		                   y,
		                   class_weight=None,
		                   sample_weight=None,
		                   **kwargs):
		    """Delegate a single-batch update to `self.model.train_on_batch`.

		    Parameters
		    ----------
		    x: input data, forwarded unchanged.
		    y: labels, forwarded unchanged.
		    class_weight: forwarded as `class_weight`.
		    sample_weight: forwarded as `sample_weight`.
		    kwargs: only the deprecated `accuracy` key is tolerated; it is dropped
		      with a warning.

		    Returns
		    -------
		    Whatever `self.model.train_on_batch` returns.

		    Raises
		    ------
		    RuntimeError
		      If `self.model` is None.
		    TypeError
		      If any keyword other than `accuracy` is supplied.
		    """
		    if self.model is None:
		        raise RuntimeError('The model needs to be compiled before being used.')
		    unknown = {
		        key: value
		        for key, value in kwargs.items() if key != 'accuracy'
		    }
		    if len(unknown) != len(kwargs):
		        warnings.warn('The "accuracy" argument is deprecated, '
		                      'instead you should pass the "accuracy" metric to '
		                      'the model at compile time:\n'
		                      '`model.compile(optimizer, loss, '
		                      'metrics=["accuracy"])`')
		    if unknown:
		        raise TypeError('Received unknown keyword arguments: ' + str(unknown))
		    weights = dict(sample_weight=sample_weight, class_weight=class_weight)
		    return self.model.train_on_batch(x, y, **weights)

		def test_on_batch(self, x, y, sample_weight=None, **kwargs):
		"""Evaluates the model over a single batch of samples.

		# Arguments
		x: input data, as a Numpy array or list of Numpy arrays
		(if the model has multiple inputs).
		y: labels, as a Numpy array.
		sample_weight: sample weights, as a Numpy array.

		# Returns
		Scalar test loss (if the model has no metrics)
		or list of scalars (if the model computes other metrics).
		The attribute `model.metrics_names` will give you
		the display labels for the scalar outputs.
		"""
		if self.model is None:
		raise RuntimeError('The model needs to be compiled ' 'before being used.')
		if 'accuracy' in kwargs:
		kwargs.pop('accuracy')
		warnings.warn('The "accuracy" argument is deprecated, '
		'instead you should pass the "accuracy" metric to '
		'the model at compile time:\n'
		'`model.compile(optimizer, loss, '
		'metrics=["accuracy"])`')
		if kwargs:
		raise TypeError('Received unknown keyword arguments: ' + str(kwargs))
		return self.model.test_on_batch(x, y, sample_weight=sample_weight)

		def predict_proba(self, x, batch_size=32, verbose=1):
		    """Return `self.predict(...)` and warn when values leave [0, 1].

		    Parameters
		    ----------
		    x: input data, forwarded to `self.predict`.
		    batch_size: integer, forwarded positionally to `self.predict`.
		    verbose: verbosity mode, forwarded positionally to `self.predict`.

		    Returns
		    -------
		    The predictions exactly as returned by `self.predict`; they are not
		    modified even when the warning fires.
		    """
		    scores = self.predict(x, batch_size, verbose)
		    outside_unit_interval = scores.min() < 0. or scores.max() > 1.
		    if outside_unit_interval:
		        warnings.warn('Network returning invalid probability values. '
		                      'The last layer might not normalize predictions '
		                      'into probabilities '
		                      '(like softmax or sigmoid would).')
		    return scores

deepchem/models/tf_new_models/multitask_classifier.py

+0 −265

Original line number	Diff line number	Diff line
		@@ -19,268 +19,3 @@ import tempfile
		from deepchem.data import pad_features
		from deepchem.utils.save import log
		from deepchem.models import Model
		from deepchem.nn.copy import Input
		from deepchem.nn.copy import Dense
		from deepchem.nn import model_ops
		# TODO(rbharath): Find a way to get rid of this import?
		from deepchem.models.tf_new_models.graph_topology import merge_dicts


		def get_loss_fn(final_loss, huber_d=1.0):
		    """Return the loss function registered under the name `final_loss`.

		    Parameters
		    ----------
		    final_loss: str
		      One of 'L2', 'weighted_L2', 'L1', 'huber', 'cross_entropy', 'hinge'.
		    huber_d: float, optional
		      Threshold between the quadratic and the linear region of the 'huber'
		      loss. Ignored by every other loss.

		    Returns
		    -------
		    A callable `loss_fn(x, t)` for 'L2', 'L1' and 'huber', or
		    `loss_fn(x, t, w)` for 'weighted_L2', 'cross_entropy' and 'hinge'.

		    Raises
		    ------
		    ValueError
		      If `final_loss` is not one of the names above.
		    """
		    if final_loss == 'L2':

		        def loss_fn(x, t):
		            diff = tf.subtract(x, t)
		            return tf.reduce_sum(tf.square(diff), 0)
		    elif final_loss == 'weighted_L2':

		        def loss_fn(x, t, w):
		            diff = tf.subtract(x, t)
		            weighted_diff = tf.multiply(diff, w)
		            return tf.reduce_sum(tf.square(weighted_diff), 0)
		    elif final_loss == 'L1':

		        def loss_fn(x, t):
		            diff = tf.subtract(x, t)
		            return tf.reduce_sum(tf.abs(diff), 0)
		    elif final_loss == 'huber':

		        def loss_fn(x, t):
		            abs_diff = tf.abs(tf.subtract(x, t))
		            # Split |diff| into the part below the threshold (penalised
		            # quadratically) and the excess above it (penalised linearly).
		            # Taking min(quadratic, linear) would always pick the linear
		            # term, because 0.5*(|diff| - d)**2 >= 0.
		            clipped = tf.minimum(abs_diff, huber_d)
		            excess = abs_diff - clipped
		            return tf.reduce_sum(
		                0.5 * tf.square(clipped) + huber_d * excess, 0)
		    elif final_loss == 'cross_entropy':

		        def loss_fn(x, t, w):
		            costs = tf.nn.sigmoid_cross_entropy_with_logits(logits=x, labels=t)
		            weighted_costs = tf.multiply(costs, w)
		            return tf.reduce_sum(weighted_costs)
		    elif final_loss == 'hinge':

		        def loss_fn(x, t, w):
		            # Map {0, 1} targets onto {-1, +1} before applying the margin.
		            t = tf.multiply(2.0, t) - 1
		            costs = tf.maximum(0.0, 1.0 - tf.multiply(t, x))
		            weighted_costs = tf.multiply(costs, w)
		            return tf.reduce_sum(weighted_costs)
		    else:
		        # Fail with a clear message instead of an UnboundLocalError on the
		        # return below.
		        raise ValueError("Loss type not recognized: %s" % str(final_loss))

		    return loss_fn


		class MultitaskGraphClassifier(Model):

		def __init__(self,
		             model,
		             n_tasks,
		             n_feat,
		             logdir=None,
		             batch_size=50,
		             final_loss='cross_entropy',
		             learning_rate=.001,
		             optimizer_type="adam",
		             learning_rate_decay_time=1000,
		             beta1=.9,
		             beta2=.999,
		             pad_batches=True,
		             verbose=True):
		    """Wrap `model` with per-task logits, a training loss and an optimizer.

		    Emits a DeprecationWarning, then builds the classifier ops inside
		    `model.graph` and runs the global variable initializer in a new session.

		    Parameters
		    ----------
		    model: object exposing `graph`, `get_graph_topology()` and (via
		      `build`) `return_outputs()`.
		    n_tasks: number of tasks; sets the second dimension of the label and
		      weight placeholders.
		    n_feat: stored as `self.feat_dim`.
		    logdir: forwarded to the base class as `model_dir`.
		    batch_size: stored; used as the loss divisor and the prediction batch
		      size.
		    final_loss: loss name handed to `add_training_loss`.
		    learning_rate, beta1, beta2: handed to the Adam optimizer in
		      `add_optimizer`.
		    optimizer_type: only "adam" is accepted by `add_optimizer`.
		    learning_rate_decay_time: stored as `self.T`; not used elsewhere in
		      this class.
		    pad_batches: stored; consulted when iterating and predicting batches.
		    verbose: forwarded to the base class.
		    """

		    warnings.warn("MultitaskGraphClassifier is deprecated. "
		                  "Will be removed in DeepChem 1.4.", DeprecationWarning)
		    super(MultitaskGraphClassifier, self).__init__(
		        model_dir=logdir, verbose=verbose)
		    self.n_tasks = n_tasks
		    self.final_loss = final_loss
		    self.model = model
		    # The session is bound to the wrapped model's graph, not a new one.
		    self.sess = tf.Session(graph=self.model.graph)

		    with self.model.graph.as_default():
		        # Extract model info
		        self.batch_size = batch_size
		        self.pad_batches = pad_batches
		        # Get graph topology for x
		        self.graph_topology = self.model.get_graph_topology()
		        self.feat_dim = n_feat

		        # Raw logit outputs
		        # build() must run first: it creates the placeholders that
		        # add_training_loss reads.
		        self.logits = self.build()
		        self.loss_op = self.add_training_loss(self.final_loss, self.logits)
		        self.outputs = self.add_softmax(self.logits)

		        self.learning_rate = learning_rate
		        self.T = learning_rate_decay_time
		        self.optimizer_type = optimizer_type

		        self.optimizer_beta1 = beta1
		        self.optimizer_beta2 = beta2

		        # Set epsilon
		        self.epsilon = 1e-7
		        # Reads the optimizer settings and loss_op assigned above.
		        self.add_optimizer()

		        # Initialize
		        self.init_fn = tf.global_variables_initializer()
		        self.sess.run(self.init_fn)

		        # Path to save checkpoint files, which matches the
		        # replicated supervisor's default path.
		        self._save_path = os.path.join(self.model_dir, 'model.ckpt')

		def build(self):
		    """Create the label and weight placeholders and return the task logits.

		    The logits come from `model_ops.multitask_logits` applied to
		    `self.model.return_outputs()`.
		    """
		    per_task_shape = (None, self.n_tasks)
		    # Target inputs: one column per task.
		    self.label_placeholder = tf.placeholder(
		        dtype='bool', shape=per_task_shape, name="label_placeholder")
		    self.weight_placeholder = tf.placeholder(
		        dtype='float32', shape=per_task_shape, name="weight_placholder")

		    features = self.model.return_outputs()
		    return model_ops.multitask_logits(features, self.n_tasks)

		def add_optimizer(self):
		    """Create the Adam optimizer and the op that minimizes `self.loss_op`.

		    Raises
		    ------
		    ValueError
		      If `self.optimizer_type` is anything other than "adam".
		    """
		    if self.optimizer_type != "adam":
		        raise ValueError("Optimizer type not recognized.")

		    adam_settings = dict(
		        beta1=self.optimizer_beta1,
		        beta2=self.optimizer_beta2,
		        epsilon=self.epsilon)
		    self.optimizer = tf.train.AdamOptimizer(self.learning_rate,
		                                            **adam_settings)

		    # Training step used by fit().
		    self.train_op = self.optimizer.minimize(self.loss_op)

		def construct_feed_dict(self, X_b, y_b=None, w_b=None, training=True):
		    """Build the feed dict for one batch.

		    Missing labels or weights are replaced by zero arrays with one row per
		    element of `X_b` and `self.n_tasks` columns. The result merges the
		    label/weight placeholders with the entries produced by
		    `self.graph_topology.batch_to_feed_dict(X_b)`. `training` is accepted
		    but not used.
		    """
		    blank_shape = (len(X_b), self.n_tasks)
		    labels = np.zeros(blank_shape) if y_b is None else y_b
		    weights = np.zeros(blank_shape) if w_b is None else w_b
		    target_feeds = {
		        self.label_placeholder: labels,
		        self.weight_placeholder: weights,
		    }

		    # Graph-structure inputs for this batch.
		    topology_feeds = self.graph_topology.batch_to_feed_dict(X_b)

		    # TODO (hraut->rhbarath): num_datapoints should be a vector, with ith
		    # element being the number of labeled data points in target_i. This is
		    # to normalize each task.
		    # TODO(rbharath): Figure out how to handle phase appropriately
		    return merge_dicts([target_feeds, topology_feeds])

		def add_training_loss(self, final_loss, logits):
		    """Sum the per-task losses and divide by `self.batch_size`.

		    `logits` is indexed by task. The loss function obtained from
		    `get_loss_fn(final_loss)` is called with three arguments: the task's
		    raw logits, its one-hot labels and its weight column.
		    """
		    loss_fn = get_loss_fn(final_loss)
		    # Cut the (batch, n_tasks) placeholders into one column per task.
		    split_options = dict(axis=1, num_or_size_splits=self.n_tasks)
		    label_columns = tf.split(value=self.label_placeholder, **split_options)
		    weight_columns = tf.split(value=self.weight_placeholder, **split_options)

		    per_task_losses = []
		    for task, (label_column, weight_column) in enumerate(
		            zip(label_columns, weight_columns)):
		        # Two-class one-hot encoding of the task labels.
		        class_ids = tf.to_int32(tf.squeeze(label_column))
		        one_hot_labels = tf.to_float(tf.one_hot(class_ids, 2))
		        # The loss receives un-softmaxed logits, not softmax outputs.
		        per_task_losses.append(
		            loss_fn(logits[task], one_hot_labels, weight_column))

		    # It's ok to divide by just the batch_size rather than the number of
		    # nonzero examples (effect averages out)
		    return tf.div(tf.add_n(per_task_losses), self.batch_size)

		def add_softmax(self, outputs):
		    """Return a list holding the softmax of each element of `outputs`.

		    The ops are created under the 'inference' name scope and named
		    'softmax_<index>'.
		    """
		    with tf.name_scope('inference'):
		        return [
		            tf.nn.softmax(task_logits, name='softmax_%d' % index)
		            for index, task_logits in enumerate(outputs)
		        ]

		def fit(self,
		        dataset,
		        nb_epoch=10,
		        max_checkpoints_to_keep=5,
		        log_every_N_batches=50,
		        checkpoint_interval=10,
		        **kwargs):
		    """Run `self.train_op` over every batch of `dataset` for `nb_epoch` epochs.

		    Batches come from `dataset.iterbatches(self.batch_size,
		    pad_batches=self.pad_batches)`. No checkpoints are written:
		    `max_checkpoints_to_keep`, `checkpoint_interval` and `kwargs` are
		    accepted but unused.
		    """
		    log("Training for %d epochs" % nb_epoch, self.verbose)

		    # TODO(rbharath): Disabling saving for now to try to debug.
		    for epoch in range(nb_epoch):
		        log("Starting epoch %d" % epoch, self.verbose)
		        batches = dataset.iterbatches(
		            self.batch_size, pad_batches=self.pad_batches)
		        for batch_num, batch in enumerate(batches):
		            X_b, y_b, w_b, _ = batch
		            if batch_num % log_every_N_batches == 0:
		                log("On batch %d" % batch_num, self.verbose)
		            feed = self.construct_feed_dict(X_b, y_b, w_b)
		            self.sess.run(self.train_op, feed_dict=feed)
		def save(self):
		    """Do nothing; this model doesn't currently support saving."""
		    return None

		def predict(self, dataset, transformers=[], **kwargs):
		    """Call the base-class `predict` with `batch_size=self.batch_size`.

		    Extra keyword arguments are accepted but not forwarded.
		    """
		    base = super(MultitaskGraphClassifier, self)
		    return base.predict(dataset, transformers, batch_size=self.batch_size)

		def predict_proba(self, dataset, transformers=[], n_classes=2, **kwargs):
		    """Call the base-class `predict_proba` with `batch_size=self.batch_size`.

		    `n_classes` is forwarded; other keyword arguments are accepted but not
		    forwarded.
		    """
		    base = super(MultitaskGraphClassifier, self)
		    return base.predict_proba(
		        dataset, transformers, n_classes=n_classes, batch_size=self.batch_size)

		def predict_on_batch(self, X):
		    """Return the arg-max class index per sample and task for one batch.

		    When `self.pad_batches` is set, `X` is first passed through
		    `pad_features(self.batch_size, X)`. The result has one row per element
		    of the (possibly padded) batch and `self.n_tasks` columns.
		    """
		    batch = pad_features(self.batch_size, X) if self.pad_batches else X
		    with self.sess.as_default():
		        feed = self.construct_feed_dict(batch)
		        # One array of class scores per task.
		        per_task_scores = self.sess.run(self.outputs, feed_dict=feed)

		    predictions = np.zeros((len(batch), self.n_tasks))
		    for column, scores in enumerate(per_task_scores):
		        predictions[:, column] = np.argmax(scores, axis=1)
		    return predictions

		def predict_proba_on_batch(self, X, n_classes=2):
		    """Return the per-class outputs for every sample and task of one batch.

		    When `self.pad_batches` is set, `X` is first passed through
		    `pad_features(self.batch_size, X)`. The result is indexed as
		    [sample, task, class].
		    """
		    batch = pad_features(self.batch_size, X) if self.pad_batches else X
		    with self.sess.as_default():
		        feed = self.construct_feed_dict(batch)
		        per_task_scores = self.sess.run(self.outputs, feed_dict=feed)

		    probabilities = np.zeros((len(batch), self.n_tasks, n_classes))
		    for task_index, scores in enumerate(per_task_scores):
		        probabilities[:, task_index, :] = scores
		    return probabilities

		def get_num_tasks(self):
		    """Return `self.n_tasks`; the superclass `predict()` relies on this."""
		    task_count = self.n_tasks
		    return task_count

deepchem/models/tf_new_models/multitask_regressor.py

+0 −215

File changed.

Preview size limit exceeded, changes collapsed.

deepchem/models/tf_new_models/support_classifier.py

+0 −365

File changed.

Preview size limit exceeded, changes collapsed.

Admin message