Fixes (b3463f05) · Commits · 钟慕尧 / deepchem

deepchem/metrics/init.py

+46 −26

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@ import numpy as np
		import warnings
		import sklearn.metrics
		import logging
		# TODO: Imported metrics will be removed in a future version of DeepChem
		from sklearn.metrics import matthews_corrcoef
		from sklearn.metrics import recall_score
		from sklearn.metrics import r2_score
		@@ -108,7 +109,7 @@ def normalize_weight_shape(w, n_samples, n_tasks):



		def normalize_prediction_shape(y, mode="classification", n_classes=None):
		def normalize_prediction_shape(y, mode=None, n_classes=None):
		"""A utility function to correct the shape of the input array.

		The metric computation classes expect that inputs for classification
		@@ -133,9 +134,12 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
		must take values from `0` to `n_classes-1` as integers. If
		`mode=="regression"`, `y` is an array of shape `(N,)` or `(N,
		n_tasks)`or `(N, n_tasks, 1)`. In the edge case where `N == 1`,
		`y` may be a scalar.
		mode: str
		Must be either "classification" or "regression".
		`y` may be a scalar. If `mode` is None, then `y` can be of any
		shape and is returned unchanged.
		mode: str, optional (default None)
		If `mode` is "classification" or "regression", attempts to apply
		data transformations. For other modes, performs no transformations
		to data and returns as-is.
		n_classes: int, optional
		If specified use this as the number of classes. Else will try to
		impute it as `n_classes = max(y) + 1` for arrays and as
		@@ -149,6 +153,7 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
		n_tasks, n_classes)`. If `mode=="regression"`, `y_out` is an array
		of shape `(N, n_tasks)`.
		"""
		if mode == "classification":
		if n_classes is None:
		if isinstance(y, np.ndarray):
		# Find number of classes. Note that `y` must have values in
		@@ -157,7 +162,6 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
		else:
		# scalar case
		n_classes = 2
		if mode == "classification":
		if isinstance(y, np.ndarray):
		if len(y.shape) == 1:
		# y_hot is of shape (N, n_classes)
		@@ -201,6 +205,10 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
		raise ValueError("y must a float sclar or a ndarray of shape `(N,)` or `(N, n_tasks)` or `(N, n_tasks, 1)` for regression problems.")
		y = np.array(y)
		y_out = np.reshape(y, (1, 1))
		else:
		# If mode isn't classification or regression don't perform any
		# transformations.
		y_out = y
		return y_out

		def to_one_hot(y, n_classes=2):
		@@ -454,7 +462,7 @@ class Metric(object):
		name=None,
		threshold=None,
		mode=None,
		**kwargs):
		compute_energy_metric=None):
		"""
		Parameters
		----------
		@@ -464,17 +472,21 @@ class Metric(object):
		task_averager: function, optional
		If not None, should be a function that averages metrics across
		tasks. For example, task_averager=np.mean. If task_averager is
		provided, this task will be inherited as a multitask metric.
		name: str, optional
		provided, this metric will be assumed to be multitask and
		`self.is_multitask` will be set to True.
		name: str, optional (default None)
		Name of this metric
		threshold: float, optional
		threshold: float, optional (default None)
		Used for binary metrics and is the threshold for the positive
		class
		mode: str, optional
		Must be either classification or regression.
		class.
		mode: str, optional (default None)
		Should usually be "classification" or "regression."
		compute_energy_metric: bool, optional (default None)
		Deprecated metric. Will be removed in a future version of
		DeepChem. Do not use.
		"""
		if "compute_energy_metric" in kwargs:
		self.compute_energy_metric = kwargs["compute_energy_metric"]
		if compute_energy_metric is not None:
		self.compute_energy_metric = compute_energy_metric
		logger.warn("compute_energy_metric is deprecated and will be removed in a future version of DeepChem.")
		else:
		self.compute_energy_metric = False
		@@ -483,13 +495,20 @@ class Metric(object):
		self.is_multitask = (self.task_averager is not None)
		if name is None:
		if not self.is_multitask:
		if hasattr(self.metric, '__name__'):
		self.name = self.metric.__name__
		else:
		self.name = "unknown metric"
		else:
		if hasattr(self.metric, '__name__'):
		self.name = self.task_averager.__name__ + "-" + self.metric.__name__
		else:
		self.name = "unknown metric"
		else:
		self.name = name
		self.threshold = threshold
		if mode is None:
		# These are some smart defaults
		if self.metric.__name__ in [
		"roc_auc_score", "matthews_corrcoef", "recall_score",
		"accuracy_score", "kappa_score", "precision_score",
		@@ -502,11 +521,12 @@ class Metric(object):
		]:
		mode = "regression"
		else:
		raise ValueError("Must specify mode for new metric.")
		assert mode in ["classification", "regression"]
		logger.info("Support for non classification/regression metrics is new. Check your results carefully.")
		# Attempts to set threshold defaults intelligently
		if self.metric.__name__ in [
		"accuracy_score", "balanced_accuracy_score", "recall_score",
		"matthews_corrcoef", "precision_score", "f1_score"
		"matthews_corrcoef", "roc_auc_score", "precision_score",
		"f1_score"
		] and threshold is None:
		self.threshold = 0.5
		self.mode = mode

deepchem/utils/evaluate.py

+137 −40

Original line number	Diff line number	Diff line
		@@ -14,6 +14,54 @@ from deepchem.metrics import Metric
		logger = logging.getLogger(__name__)


		def output_statistics(scores, stats_out):
		    """Dump computed scores to a text file (deprecated helper).

		    A deprecation warning is logged, then the string form of `scores`
		    followed by a single newline is written to `stats_out`. The file is
		    opened in write mode, so existing contents are replaced.

		    Parameters
		    ----------
		    scores: dict
		      Dictionary mapping names of metrics to scores.
		    stats_out: str
		      Name of file to write scores to.
		    """
		    logger.warning("output_statistics is deprecated.")
		    with open(stats_out, "w") as handle:
		        # "%s" formatting applies str() to `scores`, one line per call.
		        handle.write("%s\n" % (scores,))


		def output_predictions(dataset, y_preds, csv_out):
		    """Write per-sample predictions to a CSV file.

		    Predictions made on `dataset` are written to `csv_out`, one row per
		    entry of `dataset.ids`. The produced CSV file has the following
		    layout:

		    | ID          | Task1Name    | Task2Name    |
		    | ----------- | ------------ | ------------ |
		    | identifier1 | prediction11 | prediction12 |
		    | identifier2 | prediction21 | prediction22 |

		    Parameters
		    ----------
		    dataset: dc.data.Dataset
		      Dataset on which predictions have been made. Only `dataset.ids`
		      and `dataset.get_task_names()` are used here.
		    y_preds: np.ndarray
		      Predictions to output. Reshaped to `(len(y_preds), n_tasks)` before
		      writing, where `n_tasks` is the number of task names.
		    csv_out: str
		      Name of file to write predictions to. Opened in write mode, so any
		      existing contents are replaced.

		    Raises
		    ------
		    AssertionError
		      If the number of prediction rows differs from the number of ids.
		    """
		    mol_ids = dataset.ids
		    # Fetch the task names once. list() keeps the header concatenation
		    # below working if the dataset returns a non-list sequence.
		    task_names = list(dataset.get_task_names())
		    n_tasks = len(task_names)
		    y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
		    assert len(y_preds) == len(mol_ids)
		    # newline="" hands line-ending control to the csv module; without it
		    # platforms that translate "\n" emit "\r\r\n" and rows appear
		    # separated by blank lines.
		    with open(csv_out, "w", newline="") as csvfile:
		        csvwriter = csv.writer(csvfile)
		        csvwriter.writerow(["ID"] + task_names)
		        for mol_id, y_pred in zip(mol_ids, y_preds):
		            csvwriter.writerow([mol_id] + list(y_pred))


		def _process_metric_input(metrics):
		"""A private helper method which processes metrics correctly.

		@@ -41,7 +89,7 @@ def _process_metric_input(metrics):
		`dc.metrics.Metric` objects.
		"""
		# Make sure input is a list
		if not len(metrics):
		if not isinstance(metrics, list):
		metrics = [metrics]
		final_metrics = []
		for i, metric in enumerate(metrics):
		@@ -51,7 +99,7 @@ def _process_metric_input(metrics):
		# This case checks if input is a function then wraps a
		# dc.metrics.Metric object around it
		elif callable(metric):
		wrap_metric = Metric(metric, name="metric-%d" % i)
		wrap_metric = Metric(metric, name="metric-%d" % (i + 1))
		final_metrics.append(wrap_metric)
		else:
		raise ValueError(
		@@ -114,15 +162,29 @@ class Evaluator(object):

		Example
		-------
		Evaluators allow for a model to be evaluated directly on a Metric
		for `sklearn`. Let's do a bit of setup constructing our dataset and
		model.

		>>> import numpy as np
		>>> X = np.random.rand(10, 5)
		>>> y = np.random.rand(10, 1)
		>>> dataset = dc.data.NumpyDataset(X, y)
		>>> model = dc.models.MultitaskRegressor(1, 5)
		>>> transformers = []

		Then you can evaluate this model as follows
		>>> import sklearn
		>>> evaluator = Evaluator(model, dataset, transformers)
		>>> multitask_scores = evaluator.compute_model_performance(
		... sklearn.metrics.mean_absolute_error)

		Evaluators can also be used with `dc.metrics.Metric` objects as well
		in case you want to customize your metric further.

		>>> evaluator = Evaluator(model, dataset, transformers)
		>>> metric = dc.metrics.Metric(dc.metrics.mae_score)
		>>> multitask_scores = evaluator.compute_model_performance([metric])
		>>> multitask_scores = evaluator.compute_model_performance(metric)
		"""

		def __init__(self, model, dataset, transformers):
		@@ -131,7 +193,6 @@ class Evaluator(object):
		self.output_transformers = [
		transformer for transformer in transformers if transformer.transform_y
		]
		self.task_names = dataset.get_task_names()

		def output_statistics(self, scores, stats_out):
		""" Write computed stats to file.
		@@ -143,26 +204,35 @@ class Evaluator(object):
		stats_out: str
		Name of file to write scores to.
		"""
		logger.warning(
		"Evaluator.output_statistics is deprecated. Please use dc.utils.evaluate.output_statistics instead. This method will be removed in a future version of DeepChem."
		)
		with open(stats_out, "w") as statsfile:
		statsfile.write(str(scores) + "\n")

		def output_predictions(self, y_preds, csv_out):
		"""Writes predictions to file.

		Writes predictions made on `self.dataset` to a specified file on
		disk. `self.dataset.ids` are used to format predictions.

		Parameters
		----------
		y_preds: np.ndarray
		Predictions to output
		csvfile: str
		csv_out: str
		Name of file to write predictions to.
		"""
		logger.warning(
		"Evaluator.output_predictions is deprecated. Please use dc.utils.evaluate.output_predictions instead. This method will be removed in a future version of DeepChem."
		)
		mol_ids = self.dataset.ids
		n_tasks = len(self.task_names)
		n_tasks = len(self.dataset.get_task_names())
		y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
		assert len(y_preds) == len(mol_ids)
		with open(csv_out, "w") as csvfile:
		csvwriter = csv.writer(csvfile)
		csvwriter.writerow(["Compound"] + self.dataset.get_task_names())
		csvwriter.writerow(["ID"] + self.dataset.get_task_names())
		for mol_id, y_pred in zip(mol_ids, y_preds):
		csvwriter.writerow([mol_id] + list(y_pred))

		@@ -170,17 +240,29 @@ class Evaluator(object):
		metrics,
		csv_out=None,
		stats_out=None,
		per_task_metrics=False):
		per_task_metrics=False,
		n_classes=None):
		"""
		Computes statistics of model on test data and saves results to csv.

		Parameters
		----------
		metrics: list
		List of dc.metrics.Metric objects
		csv_out: str, optional
		metrics: dc.metrics.Metric/list[dc.metrics.Metric]/function
		The set of metrics provided. This class attempts to do some
		intelligent handling of input. If a single `dc.metrics.Metric`
		object is provided or a list is provided, it will evaluate
		`self.model` on these metrics. If a function is provided, it is
		assumed to be a metric function that this method will attempt to
		wrap in a `dc.metrics.Metric` object. A metric function must
		accept two arguments, `y_true, y_pred` both of which are
		`np.ndarray` objects and return a floating point score.
		n_classes: int, optional (default None)
		If specified, will assume that all `metrics` are classification
		metrics and will use `n_classes` as the number of unique classes
		in `self.dataset`.
		csv_out: str, optional (Deprecated)
		Filename to write CSV of model predictions.
		stats_out: str, optional
		stats_out: str, optional (Deprecated)
		Filename to write computed statistics.
		per_task_metrics: bool, optional
		If true, return computed metric for each task on multitask dataset.
		@@ -193,39 +275,35 @@ class Evaluator(object):
		If `per_task_metrics == True`, then returns a second dictionary
		of scores for each task separately.
		"""
		if csv_out is not None:
		logger.warning(
		"csv_out is deprecated as an argument and will be removed in a future version of DeepChem. Output is not written to CSV; manually write output instead."
		)
		if stats_out is not None:
		logger.warning(
		"stats_out is deprecated as an argument and will be removed in a future version of DeepChem. Stats output is not written; please manually write output instead"
		)
		# Process input metrics
		metrics = _process_metric_input(metrics)

		y = self.dataset.y
		y = undo_transforms(y, self.output_transformers)
		w = self.dataset.w

		if not len(metrics):
		return {}
		else:
		mode = metrics[0].mode
		y_pred = self.model.predict(self.dataset, self.output_transformers)
		if mode == "classification":
		y_pred_print = np.argmax(y_pred, -1)
		else:
		y_pred_print = y_pred

		multitask_scores = {}
		all_task_scores = {}

		if csv_out is not None:
		logger.info("Saving predictions to %s" % csv_out)
		self.output_predictions(y_pred_print, csv_out)

		# Compute multitask metrics
		for metric in metrics:
		results = metric.compute_metric(
		y, y_pred, w, per_task_metrics=per_task_metrics, n_classes=n_classes)
		if per_task_metrics:
		multitask_scores[metric.name], computed_metrics = metric.compute_metric(
		y, y_pred, w, per_task_metrics=True)
		multitask_scores[metric.name], computed_metrics = results
		all_task_scores[metric.name] = computed_metrics
		else:
		multitask_scores[metric.name] = metric.compute_metric(
		y, y_pred, w, per_task_metrics=False)

		if stats_out is not None:
		logger.info("Saving stats to %s" % stats_out)
		self.output_statistics(multitask_scores, stats_out)
		multitask_scores[metric.name] = results

		if not per_task_metrics:
		return multitask_scores
		@@ -247,10 +325,21 @@ class GeneratorEvaluator(object):
		>>> y = np.random.rand(10, 1)
		>>> dataset = dc.data.NumpyDataset(X, y)
		>>> model = dc.models.MultitaskRegressor(1, 5)
		>>> transformers = []
		>>> generator = model.default_generator(dataset, pad_batches=False)
		>>> evaluator = Evaluator(model, generator, transformers)
		>>> multitask_scores = evaluator.compute_model_performance([metric])

		Then you can evaluate this model as follows

		>>> import sklearn
		>>> evaluator = GeneratorEvaluator(model, generator, transformers)
		>>> multitask_scores = evaluator.compute_model_performance(
		... sklearn.metrics.mean_absolute_error)

		Evaluators can also be used with `dc.metrics.Metric` objects as well
		in case you want to customize your metric further.

		>>> evaluator = GeneratorEvaluator(model, dataset, transformers)
		>>> metric = dc.metrics.Metric(dc.metrics.mae_score)
		>>> multitask_scores = evaluator.compute_model_performance(metric)
		"""

		def __init__(self, model, generator, transformers, labels=None, weights=None):
		@@ -281,7 +370,10 @@ class GeneratorEvaluator(object):
		if labels is not None and len(labels) != 1:
		raise ValueError("GeneratorEvaluator currently only supports one label")

		def compute_model_performance(self, metrics, per_task_metrics=False):
		def compute_model_performance(self,
		metrics,
		per_task_metrics=False,
		n_classes=None):
		"""
		Computes statistics of model on test data and saves results to csv.

		@@ -299,6 +391,10 @@ class GeneratorEvaluator(object):
		per_task_metrics: bool, optional
		If true, return computed metric for each task on multitask
		dataset.
		n_classes: int, optional (default None)
		If specified, will assume that all `metrics` are classification
		metrics and will use `n_classes` as the number of unique classes
		in `self.dataset`.

		Returns
		-------
		@@ -315,6 +411,7 @@ class GeneratorEvaluator(object):
		w = []

		def generator_closure():
		"""This function is used to pull true labels/weights out as we iterate over the generator."""
		if self.label_keys is None:
		weights = None
		# This is a KerasModel.
		@@ -350,13 +447,13 @@ class GeneratorEvaluator(object):

		# Compute multitask metrics
		for metric in metrics:
		results = metric.compute_metric(
		y, y_pred, w, per_task_metrics=per_task_metrics)
		if per_task_metrics:
		multitask_scores[metric.name], computed_metrics = metric.compute_metric(
		y, y_pred, w, per_task_metrics=True)
		multitask_scores[metric.name], computed_metrics = results
		all_task_scores[metric.name] = computed_metrics
		else:
		multitask_scores[metric.name] = metric.compute_metric(
		y, y_pred, w, per_task_metrics=False)
		multitask_scores[metric.name] = results

		if not per_task_metrics:
		return multitask_scores

deepchem/utils/test/test_evaluate.py

+66 −18

Original line number	Diff line number	Diff line
		@@ -2,6 +2,7 @@
		import deepchem as dc
		import numpy as np
		import unittest
		import sklearn
		from deepchem.utils.evaluate import Evaluator
		from deepchem.utils.evaluate import GeneratorEvaluator

		@@ -12,40 +13,72 @@ class TestEvaluator(unittest.TestCase):
		X = np.random.rand(10, 5)
		y = np.random.rand(10, 1)
		self.dataset = dc.data.NumpyDataset(X, y)
		self.model = dc.models.MultitaskRegressor(1, 5)

		def test_evaluator_dc_metric(self):
		"""Test an evaluator on a dataset."""
		model = dc.models.MultitaskRegressor(1, 5)
		transformers = []
		evaluator = Evaluator(model, self.dataset, transformers)
		evaluator = Evaluator(self.model, self.dataset, [])
		metric = dc.metrics.Metric(dc.metrics.mae_score)
		multitask_scores = evaluator.compute_model_performance([metric])
		multitask_scores = evaluator.compute_model_performance(metric)
		assert isinstance(multitask_scores, dict)
		assert len(multitask_scores) == 1
		assert multitask_scores['mae_score'] > 0

		def test_evaluator_dc_multi_metric(self):
		    """Evaluate with a list of two `dc.metrics.Metric` objects.

		    Checks that the result is a dict with two entries, that the
		    'mae_score' entry is positive and that an 'r2_score' entry exists.
		    """
		    mae = dc.metrics.Metric(dc.metrics.mae_score)
		    r2 = dc.metrics.Metric(dc.metrics.r2_score)
		    scores = Evaluator(self.model, self.dataset,
		                       []).compute_model_performance([mae, r2])
		    assert isinstance(scores, dict)
		    assert len(scores) == 2
		    assert scores['mae_score'] > 0
		    assert "r2_score" in scores


		def test_evaluator_sklearn_metric(self):
		    """Evaluate with a bare sklearn metric function, not a Metric object."""
		    scorer = sklearn.metrics.mean_absolute_error
		    scores = Evaluator(self.model, self.dataset,
		                       []).compute_model_performance(scorer)
		    assert isinstance(scores, dict)
		    assert len(scores) == 1
		    # No name was provided for the raw function, so its score is
		    # expected under the order-based key "metric-1".
		    assert scores['metric-1'] > 0

		def test_evaluator_sklearn_multi_metric(self):
		    """Evaluate with a list of two bare sklearn metric functions."""
		    raw_metrics = [
		        sklearn.metrics.mean_absolute_error,
		        sklearn.metrics.r2_score,
		    ]
		    scores = Evaluator(self.model, self.dataset,
		                       []).compute_model_performance(raw_metrics)
		    assert isinstance(scores, dict)
		    assert len(scores) == 2
		    # No names were provided, so scores are expected under keys
		    # indexed by the order in which the functions were given.
		    assert scores['metric-1'] > 0
		    assert "metric-2" in scores

		def test_generator_evaluator_dc_metric_multitask(self):
		"""Test generator evaluator on a generator."""
		model = dc.models.MultitaskRegressor(1, 5)
		generator = model.default_generator(
		generator = self.model.default_generator(
		self.dataset, pad_batches=False)
		transformers = []
		evaluator = GeneratorEvaluator(model, generator, transformers)
		evaluator = GeneratorEvaluator(self.model, generator, [])
		metric = dc.metrics.Metric(dc.metrics.mae_score)
		multitask_scores = evaluator.compute_model_performance([metric])
		multitask_scores = evaluator.compute_model_performance(metric)
		assert isinstance(multitask_scores, dict)
		assert len(multitask_scores) == 1
		assert multitask_scores['mae_score'] > 0

		def test_generator_evaluator_dc_metric_multitask_single_point(self):
		"""Test generator evaluator on a generator."""
		model = dc.models.MultitaskRegressor(1, 5)
		generator = model.default_generator(
		generator = self.model.default_generator(
		self.dataset, pad_batches=False)
		transformers = []
		evaluator = GeneratorEvaluator(model, generator, transformers)
		evaluator = GeneratorEvaluator(self.model, generator, [])
		metric = dc.metrics.Metric(dc.metrics.mae_score)
		multitask_scores = evaluator.compute_model_performance([metric])
		multitask_scores = evaluator.compute_model_performance(metric)
		assert isinstance(multitask_scores, dict)
		assert len(multitask_scores) == 1
		print("multitask_scores")
		@@ -54,11 +87,26 @@ class TestEvaluator(unittest.TestCase):

		def test_evaluator_dc_metric_singletask(self):
		"""Test an evaluator on a dataset."""
		model = dc.models.MultitaskRegressor(1, 5)
		transformers = []
		evaluator = Evaluator(model, self.dataset, transformers)
		evaluator = Evaluator(self.model, self.dataset, [])
		metric = dc.metrics.Metric(dc.metrics.mae_score)
		multitask_scores = evaluator.compute_model_performance([metric])
		multitask_scores = evaluator.compute_model_performance(metric)
		assert isinstance(multitask_scores, dict)
		assert len(multitask_scores) == 1
		assert multitask_scores['mae_score'] > 0

		def test_multiclass_classification_singletask(self):
		    """Evaluate a 5-class, single-task classifier with accuracy_score.

		    Builds a random dataset of 100 samples with integer labels drawn
		    from 0-4, then checks that passing `n_classes=5` yields a single
		    non-negative score under the key "metric-1".
		    """
		    X = np.random.rand(100, 5)
		    y = np.random.randint(5, size=(100,))
		    dataset = dc.data.NumpyDataset(X, y)
		    model = dc.models.MultitaskClassifier(1, 5, n_classes=5)
		    scores = Evaluator(model, dataset, []).compute_model_performance(
		        sklearn.metrics.accuracy_score, n_classes=5)
		    assert len(scores) == 1
		    assert scores["metric-1"] >= 0

		# TODO: Add a multitask metrics example
		# TODO: Add a multitask per-task metric example
		# TODO: Add metrics for images here as a test

Admin message