Merge pull request #285 from miaecle/Benchmark2 (735df602) · Commits · 钟慕尧 / deepchem

README.md

+17 −6

Original line number	Diff line number	Diff line
		@@ -205,11 +205,22 @@ different subclasses of ``Featurizer`` for convenience:
		### Performances
		\|Dataset \|Model \|Train score/ROC-AUC\|Valid score/ROC-AUC\|Time(loading)/s \|Time(running)/s\|
		\|-----------\|--------------------\|-------------------\|-------------------\|----------------\|---------------\|
		\|tox21 \|tensorflow(MT-DNN) \|0.987 \|0.800 \|35 \|36 \|
		\|muv \|tensorflow(MT-DNN) \|0.979 \|0.660 \|414 \|255 \|
		\|pcba \|tensorflow(MT-DNN) \|0.949 \|0.791 \|1765 \|7209 \|
		\|sider \|tensorflow(MT-DNN) \|0.864 \|0.627 \|10 \|63 \|
		\|toxcast \|tensorflow(MT-DNN) \|0.944 \|0.697 \|75 \|2374 \|
		\|tox21 \|logistic regression \|0.910 \|0.759 \|30 \|30 \|
		\| \|tensorflow(MT-NN) \|0.987 \|0.800 \|30 \|30 \|
		\| \|graph convolution \|0.930 \|0.819 \|40 \|40 \|
		\|muv \|logistic regression \|0.910 \|0.744 \|600 \|800 \|
		\| \|tensorflow(MT-NN) \|0.980 \|0.710 \|600 \|800 \|
		\| \|graph convolution \|0.881 \|0.832 \|800 \|1200 \|
		\|pcba \|logistic regression \|0.759 \|0.736 \|1800 \|5400 \|
		\| \|tensorflow(MT-NN) \|0.949 \|0.791 \|1800 \|7200 \|
		\| \|graph convolution \|0.866 \|0.836 \|2200 \|20000 \|
		\|sider \|logistic regression \|0.900 \|0.620 \|15 \|40 \|
		\| \|tensorflow(MT-NN) \|0.931 \|0.647 \|15 \|60 \|
		\| \|graph convolution \|0.845 \|0.646 \|20 \|60 \|
		\|toxcast \|logistic regression \|0.762 \|0.622 \|80 \|2000 \|
		\| \|tensorflow(MT-NN) \|0.926 \|0.705 \|80 \|2400 \|
		\| \|graph convolution \|0.906 \|0.725 \|80 \|3000 \|


		## Contributing to DeepChem

deepchem/models/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -20,3 +20,4 @@ from deepchem.models.keras_models.fcnet import MultiTaskDNN
		from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
		from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
		from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskRegressor
		from deepchem.models.tensorflow_models.lr import TensorflowLogisticRegression
		No newline at end of file

deepchem/models/tensorflow_models/lr.py

0 → 100644

+231 −0

Original line number	Diff line number	Diff line
		# -*- coding: utf-8 -*-
		"""
		Created on Tue Nov 08 14:10:02 2016

		@author: Zhenqin Wu
		"""
		import tensorflow as tf
		import numpy as np
		import os
		import time

		from deepchem.metrics import from_one_hot
		from deepchem.models.tensorflow_models import TensorflowGraph
		from deepchem.models.tensorflow_models import TensorflowGraphModel
		from deepchem.models.tensorflow_models import model_ops
		from deepchem.utils.save import log
		from deepchem.data import pad_features
		from deepchem.metrics import to_one_hot

		def weight_decay(penalty_type, penalty):
		  """Build a weight-decay cost over the trainable weight matrices.

		  This is the variant used by the logistic regression model in this file:
		  its weights are rank-2 tensors and its biases are rank-1 tensors, so
		  biases are excluded by rank.

		  Args:
		    penalty_type: 'l1' (sum of absolute values) or 'l2' (tf.nn.l2_loss).
		    penalty: Multiplier applied to the summed cost.

		  Returns:
		    The summed cost over the selected variables, multiplied by `penalty`.
		    The same value is also registered as the 'Weight Decay Cost' scalar
		    summary.

		  Raises:
		    NotImplementedError: If `penalty_type` is neither 'l1' nor 'l2'.
		  """
		  # Exclude bias variables. Select on rank rather than on the size of the
		  # first dimension: a weight matrix with a single input feature has
		  # shape [1, 1] and must still be penalized, while a bias is rank-1
		  # whatever its length.
		  variables = [
		      v for v in tf.trainable_variables()
		      if len(v.get_shape().as_list()) > 1
		  ]

		  with tf.name_scope('weight_decay'):
		    if penalty_type == 'l1':
		      cost = tf.add_n([tf.reduce_sum(tf.abs(v)) for v in variables])
		    elif penalty_type == 'l2':
		      cost = tf.add_n([tf.nn.l2_loss(v) for v in variables])
		    else:
		      raise NotImplementedError('Unsupported penalty_type %s' % penalty_type)
		    cost *= penalty
		    tf.scalar_summary('Weight Decay Cost', cost)
		  return cost


		class TensorflowLogisticRegression(TensorflowGraphModel):
		  """A simple tensorflow based logistic regression model.

		  Every task gets its own single-unit fully connected layer on top of the
		  shared `mol_features` placeholder. At prediction time each sigmoid
		  output p is expanded into the pair [1 - p, p].
		  """

		  def build(self, graph, name_scopes, training):
		    """Constructs the graph architecture of model: n_tasks * sigmoid nodes.

		    This method creates the following Placeholders:
		      mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
		        batch_size x n_features.

		    Args:
		      graph: Graph in which the ops are created.
		      name_scopes: Passed through to TensorflowGraph.get_placeholder_scope.
		      training: Not used by this implementation.

		    Returns:
		      A list with one output of model_ops.fully_connected_layer (size=1)
		      per task.
		    """
		    placeholder_scope = TensorflowGraph.get_placeholder_scope(
		        graph, name_scopes)
		    n_features = self.n_features
		    with graph.as_default():
		      with placeholder_scope:
		        self.mol_features = tf.placeholder(
		            tf.float32,
		            shape=[None, n_features],
		            name='mol_features')

		      # Only the first entry of each init list is read; all tasks share it.
		      weight_init_stddevs = self.weight_init_stddevs
		      bias_init_consts = self.bias_init_consts
		      lg_list = []
		      for _ in range(self.n_tasks):
		        # setting up n_tasks nodes (output nodes)
		        lg = model_ops.fully_connected_layer(
		            tensor=self.mol_features,
		            size=1,
		            weight_init=tf.truncated_normal(
		                shape=[self.n_features, 1],
		                stddev=weight_init_stddevs[0]),
		            bias_init=tf.constant(value=bias_init_consts[0],
		                                  shape=[1]))
		        lg_list.append(lg)

		    return lg_list

		  def add_label_placeholders(self, graph, name_scopes):
		    """Creates one float label placeholder of shape [None, 1] per task.

		    The placeholders are named 'labels_<task>' and each is wrapped in
		    tf.identity before being returned.

		    Returns:
		      A list of n_tasks tensors.
		    """
		    placeholder_scope = TensorflowGraph.get_placeholder_scope(
		        graph, name_scopes)
		    labels = []
		    with placeholder_scope:
		      for task in range(self.n_tasks):
		        labels.append(tf.identity(
		            tf.placeholder(tf.float32, shape=[None, 1],
		                           name='labels_%d' % task)))
		    return labels

		  def add_training_cost(self, graph, name_scopes, output, labels, weights):
		    """Builds the training loss: summed per-task costs plus weight decay.

		    Args:
		      graph: Graph in which the ops are created.
		      name_scopes: Passed through to TensorflowGraph.shared_name_scope.
		      output: Per-task logits, indexed by task.
		      labels: Per-task label tensors, indexed by task.
		      weights: Per-task example-weight tensors, indexed by task.

		    Returns:
		      The sum over tasks of (sum of weighted costs / batch_size), plus the
		      weight_decay penalty when self.penalty is non-zero.
		    """
		    with graph.as_default():
		      gradient_costs = []  # costs used for gradient calculation

		      with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
		        for task in range(self.n_tasks):
		          task_str = str(task).zfill(len(str(self.n_tasks)))
		          with TensorflowGraph.shared_name_scope(
		              'cost_{}'.format(task_str), graph, name_scopes):
		            with tf.name_scope('weighted'):
		              weighted_cost = self.cost(output[task], labels[task],
		                                        weights[task])

		            with tf.name_scope('gradient'):
		              # Note that we divide by the batch size and not the number of
		              # non-zero weight examples in the batch. Also, instead of using
		              # tf.reduce_mean (which can put ops on the CPU) we explicitly
		              # calculate with div/sum so it stays on the GPU.
		              gradient_cost = tf.div(tf.reduce_sum(weighted_cost),
		                                     self.batch_size)
		              gradient_costs.append(gradient_cost)

		        # aggregated costs
		        with TensorflowGraph.shared_name_scope(
		            'aggregated', graph, name_scopes):
		          with tf.name_scope('gradient'):
		            loss = tf.add_n(gradient_costs)

		          # weight decay
		          if self.penalty != 0.0:
		            # using the regularization defined in this module
		            penalty = weight_decay(self.penalty_type, self.penalty)
		            loss += penalty

		      return loss

		  def cost(self, logits, labels, weights):
		    """Returns the sigmoid cross entropy of logits/labels times weights."""
		    return tf.mul(tf.nn.sigmoid_cross_entropy_with_logits(logits, labels),
		                  weights)

		  def add_output_ops(self, graph, output):
		    """Replaces each task's logits with a sigmoid op named 'sigmoid_<i>'.

		    Returns:
		      A list of sigmoid tensors, one per entry of `output`.
		    """
		    with graph.as_default():
		      with tf.name_scope('inference'):
		        sigmoid = [
		            tf.nn.sigmoid(logits, name='sigmoid_%d' % i)
		            for i, logits in enumerate(output)
		        ]
		    return sigmoid

		  def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
		    """Builds the feed dict for one batch.

		    Args:
		      X_b: Features, fed to 'mol_features'.
		      y_b: Optional labels indexed as y_b[:, task]. When None, zeros of
		        shape (batch_size, 1) are fed for every task.
		      w_b: Optional weights indexed as w_b[:, task]. When None, ones of
		        shape (batch_size,) are fed for every task.
		      ids_b: Not used by this implementation.

		    Returns:
		      The result of TensorflowGraph.get_feed_dict on a dict keyed by
		      'mol_features', 'labels_<task>' and 'weights_<task>'.
		    """
		    orig_dict = {"mol_features": X_b}
		    for task in range(self.n_tasks):
		      label_key = "labels_%d" % task
		      weight_key = "weights_%d" % task
		      if y_b is not None:
		        y_2column = to_one_hot(y_b[:, task])
		        # Keep only column 1, sliced so the result stays 2-D ([?, 1]).
		        orig_dict[label_key] = y_2column[:, 1:2]
		      else:
		        # Dummy placeholders
		        orig_dict[label_key] = np.zeros((self.batch_size, 1))
		      if w_b is not None:
		        orig_dict[weight_key] = w_b[:, task]
		      else:
		        # Dummy placeholders
		        orig_dict[weight_key] = np.ones((self.batch_size,))
		    return TensorflowGraph.get_feed_dict(orig_dict)

		  def _eval_two_class_outputs(self, X, pad_batch):
		    """Runs the eval graph on X and pairs every output p with 1 - p.

		    Shared by predict_proba_on_batch and predict_on_batch.

		    Args:
		      X: Features for one batch.
		      pad_batch: If true, X is first passed through
		        pad_features(self.batch_size, X).

		    Returns:
		      np.squeeze of the first n_tasks eval outputs stacked with their
		      complement along a new axis 2 (complement first). Tasks are on the
		      leading axis; because of the squeeze, any length-1 axis is dropped.
		    """
		    if pad_batch:
		      X = pad_features(self.batch_size, X)
		    if not self._restored_model:
		      self.restore()
		    with self.eval_graph.graph.as_default():
		      # run eval data through the model
		      with self._get_shared_session(train=False).as_default():
		        feed_dict = self.construct_feed_dict(X)
		        data = self._get_shared_session(train=False).run(
		            self.eval_graph.output, feed_dict=feed_dict)
		      positive = np.asarray(data[:self.n_tasks], dtype=float)
		      # expand each prediction p into the two entries [1 - p, p]
		      complementary = np.ones(np.shape(positive)) - positive
		      return np.squeeze(np.stack(arrays=[complementary, positive],
		                                 axis=2))

		  def predict_proba_on_batch(self, X, pad_batch=False):
		    """Returns [1 - p, p] outputs for one batch, batch axis first.

		    Args:
		      X: Features for one batch.
		      pad_batch: If true, pad X to self.batch_size before evaluation.

		    Returns:
		      A copy of the evaluated outputs with the first two axes swapped, so
		      the task-major result becomes batch-major.

		    Raises:
		      ValueError: If the squeezed outputs are neither rank 2 nor rank 3.
		    """
		    batch_outputs = self._eval_two_class_outputs(X, pad_batch)
		    # reshape to batch_size x n_tasks x ...
		    if batch_outputs.ndim == 3:
		      batch_outputs = batch_outputs.transpose((1, 0, 2))
		    elif batch_outputs.ndim == 2:
		      batch_outputs = batch_outputs.transpose((1, 0))
		    else:
		      raise ValueError(
		          'Unrecognized rank combination for output: %s ' %
		          (batch_outputs.shape,))

		    return np.copy(batch_outputs)

		  def predict_on_batch(self, X, pad_batch=False):
		    """Returns predictions for one batch, decoded with from_one_hot.

		    Args:
		      X: Features for one batch.
		      pad_batch: If true, pad X to self.batch_size before evaluation.

		    Returns:
		      A copy of from_one_hot(..., axis=-1) applied to the squeezed,
		      batch-major [1 - p, p] outputs.

		    Raises:
		      ValueError: If the squeezed outputs are neither rank 2 nor rank 3.
		    """
		    batch_output = self._eval_two_class_outputs(X, pad_batch)
		    # reshape to batch_size x n_tasks x ...
		    if batch_output.ndim == 3:
		      batch_output = batch_output.transpose((1, 0, 2))
		    elif batch_output.ndim == 2:
		      batch_output = batch_output.transpose((1, 0))
		    else:
		      raise ValueError(
		          'Unrecognized rank combination for output: %s' %
		          (batch_output.shape,))

		    outputs = np.array(from_one_hot(np.squeeze(batch_output), axis=-1))

		    return np.copy(outputs)

deepchem/models/tensorflow_models/model_ops.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -181,7 +181,7 @@ def weight_decay(penalty_type, penalty):

		with tf.name_scope('weight_decay'):
		if penalty_type == 'l1':
		cost = tf.add_n([tf.reduce_sum(tf.Abs(v)) for v in variables])
		cost = tf.add_n([tf.reduce_sum(tf.abs(v)) for v in variables])
		elif penalty_type == 'l2':
		cost = tf.add_n([tf.nn.l2_loss(v) for v in variables])
		else:

deepchem/models/tests/test_overfit.py

+32 −0

Original line number	Diff line number	Diff line
		@@ -463,6 +463,38 @@ class TestOverfit(test_util.TensorFlowTestCase):
		scores = model.evaluate(dataset, [classification_metric])
		assert scores[classification_metric.name] > .9


		def test_tf_logreg_multitask_classification_overfit(self):
		  """Test tf multitask logistic regression overfits tiny data."""
		  n_tasks = 10
		  n_samples = 10
		  n_features = 3

		  # Generate dummy dataset; the seed must be set before X is drawn.
		  np.random.seed(123)
		  ids = np.arange(n_samples)
		  X = np.random.rand(n_samples, n_features)
		  # Every label is 0 for every task, with uniform example weights.
		  y = np.zeros((n_samples, n_tasks))
		  w = np.ones((n_samples, n_tasks))
		  dataset = dc.data.NumpyDataset(X, y, w, ids)

		  verbosity = "high"
		  classification_metric = dc.metrics.Metric(
		      dc.metrics.accuracy_score, verbosity=verbosity, task_averager=np.mean)
		  tensorflow_model = dc.models.TensorflowLogisticRegression(
		      n_tasks, n_features, learning_rate=0.5, weight_init_stddevs=[.01],
		      batch_size=n_samples, verbosity=verbosity)
		  model = dc.models.TensorflowModel(tensorflow_model)

		  # Fit trained model
		  model.fit(dataset)
		  model.save()

		  # Eval model on train: accuracy averaged over tasks must exceed 0.9.
		  scores = model.evaluate(dataset, [classification_metric])
		  assert scores[classification_metric.name] > .9

		def test_sklearn_multitask_regression_overfit(self):
		"""Test SKLearn singletask-to-multitask overfits tiny regression data."""
		n_tasks = 2

Original line number	Diff line number	Diff line
		@@ -205,11 +205,22 @@ different subclasses of ``Featurizer`` for convenience:
		### Performances
		\|Dataset \|Model \|Train score/ROC-AUC\|Valid score/ROC-AUC\|Time(loading)/s \|Time(running)/s\|
		\|-----------\|--------------------\|-------------------\|-------------------\|----------------\|---------------\|
		\|tox21 \|tensorflow(MT-DNN) \|0.987 \|0.800 \|35 \|36 \|
		\|muv \|tensorflow(MT-DNN) \|0.979 \|0.660 \|414 \|255 \|
		\|pcba \|tensorflow(MT-DNN) \|0.949 \|0.791 \|1765 \|7209 \|
		\|sider \|tensorflow(MT-DNN) \|0.864 \|0.627 \|10 \|63 \|
		\|toxcast \|tensorflow(MT-DNN) \|0.944 \|0.697 \|75 \|2374 \|
		\|tox21 \|logistic regression \|0.910 \|0.759 \|30 \|30 \|
		\| \|tensorflow(MT-NN) \|0.987 \|0.800 \|30 \|30 \|
		\| \|graph convolution \|0.930 \|0.819 \|40 \|40 \|
		\|muv \|logistic regression \|0.910 \|0.744 \|600 \|800 \|
		\| \|tensorflow(MT-NN) \|0.980 \|0.710 \|600 \|800 \|
		\| \|graph convolution \|0.881 \|0.832 \|800 \|1200 \|
		\|pcba \|logistic regression \|0.759 \|0.736 \|1800 \|5400 \|
		\| \|tensorflow(MT-NN) \|0.949 \|0.791 \|1800 \|7200 \|
		\| \|graph convolution \|0.866 \|0.836 \|2200 \|20000 \|
		\|sider \|logistic regression \|0.900 \|0.620 \|15 \|40 \|
		\| \|tensorflow(MT-NN) \|0.931 \|0.647 \|15 \|60 \|
		\| \|graph convolution \|0.845 \|0.646 \|20 \|60 \|
		\|toxcast \|logistic regression \|0.762 \|0.622 \|80 \|2000 \|
		\| \|tensorflow(MT-NN) \|0.926 \|0.705 \|80 \|2400 \|
		\| \|graph convolution \|0.906 \|0.725 \|80 \|3000 \|


		## Contributing to DeepChem

Admin message