Unverified commit 4b8bc381 authored by Karl Leswing, committed by GitHub

Merge pull request #1265 from peastman/metric

Fixed incorrect code in metrics 
parents 2817b137 dc4eba99
+41 −52
@@ -3,10 +3,9 @@
 import numpy as np
 import warnings
 from deepchem.utils.save import log
-from sklearn.metrics import roc_auc_score
+import sklearn.metrics
 from sklearn.metrics import matthews_corrcoef
 from sklearn.metrics import recall_score
-from sklearn.metrics import accuracy_score
 from sklearn.metrics import r2_score
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics import mean_absolute_error
@@ -39,22 +38,29 @@ def from_one_hot(y, axis=1):
   return np.argmax(y, axis=axis)
 
 
-def compute_roc_auc_scores(y, y_pred):
-  """Transforms the results dict into roc-auc-scores and prints scores.
-
-  Parameters
-  ----------
-  results: dict
-  task_types: dict
-    dict mapping task names to output type. Each output type must be either
-    "classification" or "regression".
-  """
-  try:
-    score = roc_auc_score(y, y_pred)
-  except ValueError:
-    warnings.warn("ROC AUC score calculation failed.")
-    score = 0.5
-  return score
+def _ensure_one_hot(y):
+  """If necessary, convert class labels to one-hot encoding."""
+  if len(y.shape) == 1:
+    return to_one_hot(y)
+  return y
+
+
+def _ensure_class_labels(y):
+  """If necessary, convert one-hot encoding to class labels."""
+  if len(y.shape) == 2:
+    return from_one_hot(y)
+  return y
+
+
+def roc_auc_score(y, y_pred):
+  """Area under the receiver operating characteristic curve."""
+  return sklearn.metrics.roc_auc_score(_ensure_one_hot(y), y_pred)
+
+
+def accuracy_score(y, y_pred):
+  y = _ensure_class_labels(y)
+  y_pred = _ensure_class_labels(y_pred)
+  return sklearn.metrics.accuracy_score(y, y_pred)
 
 
 def balanced_accuracy_score(y, y_pred):
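
Note: a quick sanity check of the two new wrappers (a minimal sketch; it assumes a DeepChem build that includes this change, and the toy arrays are purely illustrative). roc_auc_score one-hot encodes integer labels before delegating to sklearn, while accuracy_score collapses probabilities back to class labels:

import numpy as np
import deepchem as dc

# 1-D class labels and (n_samples, n_classes) probabilities for 4 samples.
y_true = np.array([0, 1, 1, 0])
y_prob = np.array([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7], [0.6, 0.4]])

# Internally one-hot encodes y_true via _ensure_one_hot before calling sklearn.
print(dc.metrics.roc_auc_score(y_true, y_prob))   # 1.0 on this toy data

# Internally argmaxes y_prob via _ensure_class_labels before calling sklearn.
print(dc.metrics.accuracy_score(y_true, y_prob))  # 1.0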
@@ -74,6 +80,7 @@ def pearson_r2_score(y, y_pred):
 
 def prc_auc_score(y, y_pred):
   """Compute area under precision-recall curve"""
+  y = _ensure_one_hot(y)
   assert y_pred.shape == y.shape
   assert y_pred.shape[1] == 2
   precision, recall, _ = precision_recall_curve(y[:, 1], y_pred[:, 1])
@@ -112,13 +119,14 @@ def kappa_score(y_true, y_pred):
   yt = np.asarray(y_true, dtype=int)
   yp = np.asarray(y_pred, dtype=int)
   assert np.array_equal(
-      np.unique(yt), [0,
-                      1]), ('Class labels must be binary: %s' % np.unique(yt))
+      np.unique(yt),
+      [0, 1]), ('Class labels must be binary: %s' % np.unique(yt))
   observed_agreement = np.true_divide(
       np.count_nonzero(np.equal(yt, yp)), len(yt))
   expected_agreement = np.true_divide(
       np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) +
-      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0), len(yt)**2)
+      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0),
+      len(yt)**2)
   kappa = np.true_divide(observed_agreement - expected_agreement,
                          1.0 - expected_agreement)
   return kappa
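
Note: for reference, the arithmetic kappa_score implements is Cohen's kappa, kappa = (p_o - p_e) / (1 - p_e), with observed agreement p_o and chance agreement p_e. A standalone numeric check (toy data, not from this PR):

import numpy as np

yt = np.array([0, 0, 1, 1, 1, 0])
yp = np.array([0, 1, 1, 1, 0, 0])
p_o = np.mean(yt == yp)                    # observed agreement: 4/6
p_e = ((yt == 1).sum() * (yp == 1).sum() +
       (yt == 0).sum() * (yp == 0).sum()) / len(yt)**2  # (3*3 + 3*3)/36 = 0.5
print((p_o - p_e) / (1.0 - p_e))           # (2/3 - 1/2) / (1/2) = 1/3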
@@ -205,25 +213,20 @@ class Metric(object):
     -------
     A numpy nd.array containing metric values for each task.
     """
-    if len(y_true.shape) > 1:
-      n_samples, n_tasks = y_true.shape[0], y_true.shape[1]
-    else:
-      n_samples, n_tasks = y_true.shape[0], 1
-    if self.mode == "classification":
-      y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
-    else:
-      y_pred = np.reshape(y_pred, (n_samples, n_tasks))
-    y_true = np.reshape(y_true, (n_samples, n_tasks))
+    n_samples = y_true.shape[0]
+    expected_dims = (3 if self.mode == "classification" else 2)
+    if len(y_pred.shape) < expected_dims:
+      n_tasks = 1
+      y_true = np.expand_dims(y_true, 1)
+      y_pred = np.expand_dims(y_pred, 1)
+    else:
+      n_tasks = y_pred.shape[1]
     if w is None or len(w) == 0:
-      w = np.ones_like(y_true)
-    assert y_true.shape[0] == y_pred.shape[0] == w.shape[0]
+      w = np.ones((n_samples, n_tasks))
     computed_metrics = []
     for task in range(n_tasks):
       y_task = y_true[:, task]
-      y_pred_task = y_pred[:, task]
+      if self.mode == "regression":
+        y_pred_task = y_pred[:, task]
+      else:
+        y_pred_task = y_pred[:, task, :]
       w_task = w[:, task]
 
       metric_value = self.compute_singletask_metric(y_task, y_pred_task, w_task)
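
Note: the rework above changes the expected input convention: shapes are now inferred from y_pred rather than y_true, with classification predictions given as (n_samples, n_tasks, n_classes). A minimal sketch of a call under the new convention (it assumes the Metric API shown in this hunk; the data is illustrative):

import numpy as np
import deepchem as dc

y_true = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])    # (4 samples, 2 tasks)
probs = np.random.rand(4, 2, 1)
y_pred = np.concatenate([1.0 - probs, probs], axis=2)  # (4, 2, 2) class probabilities

metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")
print(metric.compute_metric(y_true, y_pred))           # mean ROC AUC over both tasks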
@@ -271,26 +274,12 @@ class Metric(object):
     # If there are no nonzero examples, metric is ill-defined.
     if not y_true.size:
       return np.nan
 
-    y_true = np.reshape(y_true, (n_samples,))
-    if self.mode == "classification":
-      n_classes = y_pred.shape[-1]
-      # TODO(rbharath): This has been a major source of bugs. Is there a more
-      # robust characterization of which metrics require class-probs and which
-      # don't?
-      if "roc_auc_score" in self.name or "prc_auc_score" in self.name:
-        y_true = to_one_hot(y_true).astype(int)
-        y_pred = np.reshape(y_pred, (n_samples, n_classes))
-      else:
-        y_true = y_true.astype(int)
-        # Reshape to handle 1-d edge cases
-        y_pred = np.reshape(y_pred, (n_samples, n_classes))
-        y_pred = from_one_hot(y_pred)
-    else:
-      y_pred = np.reshape(y_pred, (n_samples,))
-
     if self.threshold is not None:
       y_pred = np.greater(y_pred, threshold)
+    if len(y_true.shape) == 0:
+      y_true = np.expand_dims(y_true, 0)
+    if len(y_pred.shape) == 0:
+      y_pred = np.expand_dims(y_pred, 0)
     try:
       metric_value = self.metric(y_true, y_pred)
     except (AssertionError, ValueError) as e:
+7 −18
@@ -178,6 +178,8 @@ class GeneratorEvaluator(object):
     self.weights = weights
     if len(self.label_keys) != len(self.output_keys):
       raise ValueError("Must have same number of labels and outputs")
+    if len(self.label_keys) != 1:
+      raise ValueError("GeneratorEvaluator currently only supports one label")
 
   def compute_model_performance(self, metrics, per_task_metrics=False):
     """
@@ -196,30 +198,17 @@
 
     def generator_closure():
       for feed_dict in self.generator:
-        labels = []
-        for layer in self.label_keys:
-          labels.append(feed_dict[layer])
-          del feed_dict[layer]
-        for weight in self.weights:
-          w.append(feed_dict[weight])
-          del feed_dict[weight]
-        y.append(np.array(labels))
+        y.append(feed_dict[self.label_keys[0]])
+        if len(self.weights) > 0:
+          w.append(feed_dict[self.weights[0]])
         yield feed_dict
 
     if not len(metrics):
       return {}
     else:
       mode = metrics[0].mode
-    if mode == "classification":
-      y_pred = self.model.predict_on_generator(generator_closure())
-      y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
-      y = np.reshape(y, newshape=(-1, self.n_tasks, self.n_classes))
-      y = from_one_hot(y, axis=-1)
-    else:
-      y_pred = self.model.predict_on_generator(generator_closure())
-      y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
-      y = np.reshape(y, newshape=(-1, self.n_tasks))
-      y_pred = np.reshape(y_pred, newshape=(-1, self.n_tasks))
+    y_pred = self.model.predict_on_generator(generator_closure())
+    y = np.concatenate(y, axis=0)
     multitask_scores = {}
     all_task_scores = {}
 
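Note: a self-contained sketch of the simplified closure pattern above (the self.* attributes are stand-ins inferred from this hunk; the batch dicts are illustrative). Each feed_dict maps a graph input to a batch array; the single label is recorded as batches stream past, then stitched back together with np.concatenate:

import numpy as np

def make_closure(generator, label_key, weight_keys, y, w):
  def generator_closure():
    for feed_dict in generator:
      y.append(feed_dict[label_key])         # exactly one label per batch
      if len(weight_keys) > 0:
        w.append(feed_dict[weight_keys[0]])  # optional per-sample weights
      yield feed_dict
  return generator_closure

y, w = [], []
batches = [{"label": np.zeros((4, 2)), "x": np.ones((4, 3))} for _ in range(3)]
for fd in make_closure(iter(batches), "label", [], y, w)():
  pass                                       # stands in for predict_on_generator
print(np.concatenate(y, axis=0).shape)       # (12, 2)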
+44 −90
@@ -5,7 +5,7 @@ import numpy as np
 from deepchem.data import NumpyDataset
 from deepchem.data.datasets import Databag
 from deepchem.models.tensorgraph.layers import Dense, ReduceMean, SoftMax, SoftMaxCrossEntropy
-from deepchem.models.tensorgraph.layers import Feature, Label
+from deepchem.models.tensorgraph.layers import Feature, Label, Reshape
 from deepchem.models.tensorgraph.layers import ReduceSquareDifference
 from nose.tools import assert_true
 from flaky import flaky
@@ -17,6 +17,8 @@ class TestGeneratorEvaluator(TestCase):
   def test_compute_model_performance_multitask_classifier(self):
     n_data_points = 20
     n_features = 1
+    n_tasks = 2
+    n_classes = 2
 
     X = np.ones(shape=(n_data_points // 2, n_features)) * -1
     X1 = np.ones(shape=(n_data_points // 2, n_features))
@@ -25,43 +27,29 @@
     class_0 = np.array([[1.0, 0.0] for x in range(int(n_data_points / 2))])
     y1 = np.concatenate((class_0, class_1))
     y2 = np.concatenate((class_1, class_0))
-    X = NumpyDataset(X)
-    ys = [NumpyDataset(y1), NumpyDataset(y2)]
-
-    databag = Databag()
+    y = np.stack([y1, y2], axis=1)
+    dataset = NumpyDataset(X, y)
 
     features = Feature(shape=(None, n_features))
-    databag.add_dataset(features, X)
-
-    outputs = []
-    entropies = []
-    labels = []
-    for i in range(2):
-      label = Label(shape=(None, 2))
-      labels.append(label)
-      dense = Dense(out_channels=2, in_layers=[features])
-      output = SoftMax(in_layers=[dense])
-      smce = SoftMaxCrossEntropy(in_layers=[label, dense])
-
-      entropies.append(smce)
-      outputs.append(output)
-      databag.add_dataset(label, ys[i])
-
-    total_loss = ReduceMean(in_layers=entropies)
+    label = Label(shape=(None, n_tasks, n_classes))
+    dense = Dense(out_channels=n_tasks * n_classes, in_layers=[features])
+    logits = Reshape(shape=(None, n_tasks, n_classes), in_layers=dense)
+    output = SoftMax(in_layers=[logits])
+    smce = SoftMaxCrossEntropy(in_layers=[label, logits])
+    total_loss = ReduceMean(in_layers=smce)
 
     tg = dc.models.TensorGraph(learning_rate=0.01, batch_size=n_data_points)
-    for output in outputs:
-      tg.add_output(output)
+    tg.add_output(output)
     tg.set_loss(total_loss)
 
-    tg.fit_generator(
-        databag.iterbatches(
-            epochs=1000, batch_size=tg.batch_size, pad_batches=True))
+    tg.fit(dataset, nb_epoch=1000)
     metric = dc.metrics.Metric(
         dc.metrics.roc_auc_score, np.mean, mode="classification")
 
     scores = tg.evaluate_generator(
-        databag.iterbatches(), [metric], labels=labels, per_task_metrics=True)
+        tg.default_generator(dataset), [metric],
+        labels=[label],
+        per_task_metrics=True)
     scores = list(scores[1].values())
     # Loosening atol to see if tests stop failing sporadically
     assert_true(np.all(np.isclose(scores, [1.0, 1.0], atol=0.50)))
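
Note: a shape check for the stacked multitask labels used in the rewritten test above (values illustrative):

import numpy as np

class_0 = np.array([[1.0, 0.0]] * 10)    # one-hot "class 0" rows
class_1 = np.array([[0.0, 1.0]] * 10)
y1 = np.concatenate((class_0, class_1))  # task 1 labels: (20, 2)
y2 = np.concatenate((class_1, class_0))  # task 2 labels: (20, 2)
y = np.stack([y1, y2], axis=1)           # (n_samples, n_tasks, n_classes)
print(y.shape)                           # (20, 2, 2)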
@@ -75,44 +63,28 @@ class TestGeneratorEvaluator(TestCase):
     X = np.concatenate((X, X1))
     class_1 = np.array([[0.0, 1.0] for x in range(int(n_data_points / 2))])
     class_0 = np.array([[1.0, 0.0] for x in range(int(n_data_points / 2))])
-    y1 = np.concatenate((class_0, class_1))
-    X = NumpyDataset(X)
-    ys = [NumpyDataset(y1)]
-
-    databag = Databag()
+    y = np.concatenate((class_0, class_1))
+    dataset = NumpyDataset(X, y)
 
     features = Feature(shape=(None, n_features))
-    databag.add_dataset(features, X)
-
-    outputs = []
-    entropies = []
-    labels = []
-    for i in range(1):
-      label = Label(shape=(None, 2))
-      labels.append(label)
-      dense = Dense(out_channels=2, in_layers=[features])
-      output = SoftMax(in_layers=[dense])
-      smce = SoftMaxCrossEntropy(in_layers=[label, dense])
-
-      entropies.append(smce)
-      outputs.append(output)
-      databag.add_dataset(label, ys[i])
-
-    total_loss = ReduceMean(in_layers=entropies)
+    label = Label(shape=(None, 2))
+    dense = Dense(out_channels=2, in_layers=[features])
+    output = SoftMax(in_layers=[dense])
+    smce = SoftMaxCrossEntropy(in_layers=[label, dense])
+    total_loss = ReduceMean(in_layers=smce)
 
     tg = dc.models.TensorGraph(learning_rate=0.1)
-    for output in outputs:
-      tg.add_output(output)
+    tg.add_output(output)
     tg.set_loss(total_loss)
 
-    tg.fit_generator(
-        databag.iterbatches(
-            epochs=1000, batch_size=tg.batch_size, pad_batches=True))
+    tg.fit(dataset, nb_epoch=1000)
     metric = dc.metrics.Metric(
         dc.metrics.roc_auc_score, np.mean, mode="classification")
 
     scores = tg.evaluate_generator(
-        databag.iterbatches(), [metric], labels=labels, per_task_metrics=True)
+        tg.default_generator(dataset), [metric],
+        labels=[label],
+        per_task_metrics=True)
     scores = list(scores[1].values())
     assert_true(np.isclose(scores, [1.0], atol=0.05))

@@ -120,52 +92,34 @@ class TestGeneratorEvaluator(TestCase):
     random_seed = 42
     n_data_points = 20
     n_features = 2
+    n_tasks = 2
     np.random.seed(seed=random_seed)
 
     X = np.random.rand(n_data_points, n_features)
-    y1 = np.expand_dims(np.array([0.5 for x in range(n_data_points)]), axis=-1)
-    y2 = np.expand_dims(np.array([-0.5 for x in range(n_data_points)]), axis=-1)
-    X = NumpyDataset(X)
-    ys = [NumpyDataset(y1), NumpyDataset(y2)]
-
-    databag = Databag()
+    y1 = np.array([0.5 for x in range(n_data_points)])
+    y2 = np.array([-0.5 for x in range(n_data_points)])
+    y = np.stack([y1, y2], axis=1)
+    dataset = NumpyDataset(X, y)
 
     features = Feature(shape=(None, n_features))
-    databag.add_dataset(features, X)
-
-    outputs = []
-    losses = []
-    labels = []
-    for i in range(2):
-      label = Label(shape=(None, 1))
-      dense = Dense(out_channels=1, in_layers=[features])
-      loss = ReduceSquareDifference(in_layers=[dense, label])
-
-      outputs.append(dense)
-      losses.append(loss)
-      labels.append(label)
-      databag.add_dataset(label, ys[i])
-
-    total_loss = ReduceMean(in_layers=losses)
+    label = Label(shape=(None, n_tasks))
+    dense = Dense(out_channels=n_tasks, in_layers=[features])
+    loss = ReduceSquareDifference(in_layers=[dense, label])
 
-    tg = dc.models.TensorGraph(
-        mode="regression",
-        batch_size=20,
-        random_seed=random_seed,
-        learning_rate=0.1)
-    for output in outputs:
-      tg.add_output(output)
-    tg.set_loss(total_loss)
+    tg = dc.models.TensorGraph(random_seed=random_seed, learning_rate=0.1)
+    tg.add_output(dense)
+    tg.set_loss(loss)
 
-    tg.fit_generator(
-        databag.iterbatches(
-            epochs=1000, batch_size=tg.batch_size, pad_batches=True))
+    tg.fit(dataset, nb_epoch=1000)
     metric = [
         dc.metrics.Metric(
             dc.metrics.mean_absolute_error, np.mean, mode="regression"),
     ]
     scores = tg.evaluate_generator(
-        databag.iterbatches(), metric, labels=labels, per_task_metrics=True)
+        tg.default_generator(dataset),
+        metric,
+        labels=[label],
+        per_task_metrics=True)
     scores = list(scores[1].values())
     assert_true(np.all(np.isclose(scores, [0.0, 0.0], atol=1.0)))

+1 −1
@@ -108,7 +108,7 @@ def compute_scores(optimize):
   print()
   print('Cross entropy loss:', np.mean(losses))
   print('Prediction accuracy:', accuracy_score(y_true, y_pred > 0.5))
-  print('ROC AUC:', dc.metrics.compute_roc_auc_scores(y_true, y_pred))
+  print('ROC AUC:', dc.metrics.roc_auc_score(y_true, y_pred))
   print()