Commit 2fac2a50 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes

parent a67b080f
Loading
Loading
Loading
Loading
+49 −44
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ import sklearn.metrics
import logging
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
@@ -505,47 +506,50 @@ def mae_score(y_true, y_pred):
  return mean_absolute_error(y_true, y_pred)


def kappa_score(y_true, y_pred):
  """Calculate Cohen's kappa for binary classification tasks.

  See https://en.wikipedia.org/wiki/Cohen%27s_kappa

  Note that this implementation of Cohen's kappa expects binary labels
  and requires both classes (0 and 1) to be present in `y_true`.

  Parameters
  ----------
  y_true: np.ndarray
    Numpy array containing true values of shape `(N,)`
  y_pred: np.ndarray
    Numpy array containing predicted values of shape `(N,)`

  Returns
  -------
  kappa: float
    Cohen's kappa for this classification task.

  Raises
  ------
  ValueError: If class labels are not in [0, 1].
  AssertionError: If y_true and y_pred are not the same size, or if
    `y_true` does not contain both class labels 0 and 1.
  """
  assert len(y_true) == len(y_pred), 'Number of examples does not match.'
  yt = np.asarray(y_true, dtype=int)
  yp = np.asarray(y_pred, dtype=int)
  if not set(np.unique(yt)).issubset({0, 1}):
    raise ValueError("Class labels must be binary 0, 1")
  # Both classes must appear in y_true: if every label is the same class,
  # expected agreement can reach 1 and the kappa denominator below would
  # be zero.
  assert np.array_equal(
      np.unique(yt),
      [0, 1]), ('Class labels must be binary: %s' % np.unique(yt))
  n = len(yt)
  # Fraction of examples where prediction matches the true label.
  observed_agreement = np.true_divide(np.count_nonzero(yt == yp), n)
  # Agreement expected by chance from each vector's class frequencies.
  expected_agreement = np.true_divide(
      np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) +
      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0), n**2)
  kappa = np.true_divide(observed_agreement - expected_agreement,
                         1.0 - expected_agreement)
  return kappa
# Backward-compatible alias: `kappa_score` delegates to scikit-learn's
# `cohen_kappa_score` (imported at the top of this module), which supports
# multiclass labels, unlike the removed hand-rolled binary implementation.
kappa_score = cohen_kappa_score

#def kappa_score(y_true, y_pred):
#  """Calculate Cohen's kappa for classification tasks.
#
#  See https://en.wikipedia.org/wiki/Cohen%27s_kappa
#
#  Note that this implementation of Cohen's kappa expects binary labels.
#
#  Parameters
#  ----------
#  y_true: np.ndarray
#    Numpy array containing true values of shape `(N,)`
#  y_pred: np.ndarray
#    Numpy array containing predicted values of shape `(N,)`
#
#  Returns
#  -------
#  kappa: np.ndarray
#    Numpy array containing kappa for each classification task.
#
#  Raises
#  ------
#  AssertionError: If y_true and y_pred are not the same size, or if
#  class labels are not in [0, 1].
#  """
#  assert len(y_true) == len(y_pred), 'Number of examples does not match.'
#  yt = np.asarray(y_true, dtype=int)
#  yp = np.asarray(y_pred, dtype=int)
#  if not set(np.unique(yt)).issubset(set([0, 1])):
#    raise ValueError("Class labels must be binary 0, 1")
#  assert np.array_equal(
#      np.unique(yt),
#      [0, 1]), ('Class labels must be binary: %s' % np.unique(yt))
#  observed_agreement = np.true_divide(
#      np.count_nonzero(np.equal(yt, yp)), len(yt))
#  expected_agreement = np.true_divide(
#      np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) +
#      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0),
#      len(yt)**2)
#  kappa = np.true_divide(observed_agreement - expected_agreement,
#                         1.0 - expected_agreement)
#  return kappa


def bedroc_score(y_true, y_pred, alpha=20.0):
@@ -705,6 +709,7 @@ class Metric(object):
          "recall_score",
          "accuracy_score",
          "kappa_score",
          "cohen_kappa_score",
          "precision_score",
          "balanced_accuracy_score",
          "prc_auc_score",
@@ -719,9 +724,9 @@ class Metric(object):
        # behavior
        if classification_handling_mode is None:
          if self.metric.__name__ in [
              "matthews_corrcoef", "kappa_score", "balanced_accuracy_score",
              "recall_score", "jaccard_score", "jaccard_index", "pixel_error",
              "f1_score"
              "matthews_corrcoef", "cohen_kappa_score", "kappa_score",
              "balanced_accuracy_score", "recall_score", "jaccard_score",
              "jaccard_index", "pixel_error", "f1_score"
          ]:
            classification_handling_mode = "threshold"
          elif self.metric.__name__ in [
+16 −2
Original line number Diff line number Diff line
@@ -188,7 +188,9 @@ class Model(BaseEstimator):
               dataset: Dataset,
               metrics: List[Metric],
               transformers: List[Transformer] = [],
               per_task_metrics: bool = False):
               per_task_metrics: bool = False,
               use_sample_weights: bool = False,
               n_classes: int = 2):
    """
    Evaluates the performance of this model on specified dataset.

@@ -220,6 +222,14 @@ class Model(BaseEstimator):
      List of `dc.trans.Transformer` objects. These transformations
      must have been applied to `dataset` previously. The dataset will
      be untransformed for metric evaluation.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    use_sample_weights: bool, optional (default False)
      If set, use per-sample weights `w`.
    n_classes: int, optional (default None)
      If specified, will use `n_classes` as the number of unique classes
      in `self.dataset`. Note that this argument will be ignored for
      regression metrics.

    Returns
    -------
@@ -231,7 +241,11 @@ class Model(BaseEstimator):
      separately.
    """
    evaluator = Evaluator(self, dataset, transformers)
    return evaluator.compute_model_performance(metrics, **kwargs)
    return evaluator.compute_model_performance(
        metrics,
        per_task_metrics=per_task_metrics,
        use_sample_weights=use_sample_weights,
        n_classes=n_classes)

  def get_task_type(self) -> str:
    """
+3 −3
Original line number Diff line number Diff line
@@ -260,9 +260,9 @@ class Evaluator(object):
    use_sample_weights: bool, optional (default False)
      If set, use per-sample weights `w`.
    n_classes: int, optional (default None)
      If specified, will assume that all `metrics` are classification
      metrics and will use `n_classes` as the number of unique classes
      in `self.dataset`.
      If specified, will use `n_classes` as the number of unique classes
      in `self.dataset`. Note that this argument will be ignored for
      regression metrics.

    Returns
    -------
+0 −31
Original line number Diff line number Diff line
@@ -83,19 +83,6 @@ def test_evaluate_multiclass_classification_singletask():
  assert multitask_scores["metric-1"] >= 0


def test_multiclass_classification_singletask():
  """Evaluator should score a 5-class singletask classifier."""
  n_samples, n_features, n_labels = 100, 5, 5
  features = np.random.rand(n_samples, n_features)
  labels = np.random.randint(n_labels, size=(n_samples,))
  data = dc.data.NumpyDataset(features, labels)
  clf = dc.models.MultitaskClassifier(1, n_features, n_classes=n_labels)
  # Raw metric functions are accepted and auto-named "metric-1".
  scores = Evaluator(clf, data, []).compute_model_performance(
      dc.metrics.accuracy_score, n_classes=n_labels)
  assert len(scores) == 1
  assert scores["metric-1"] >= 0


def test_multitask_evaluator():
  """Test evaluation of a multitask metric."""
  n_tasks = 2
@@ -127,24 +114,6 @@ def test_model_evaluate_dc_metric():
  assert multitask_scores['mae_score'] > 0


def test_multitask_evaluator():
  """Test evaluation of a multitask metric.

  With `per_task_metrics=True`, `compute_model_performance` must return
  both the aggregate scores dict and a per-task scores dict, each keyed
  by metric name.
  """
  n_tasks = 2
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 2)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(2, 5)
  evaluator = Evaluator(model, dataset, [])
  metric = dc.metrics.Metric(dc.metrics.mae_score)
  multitask_scores, all_task_scores = evaluator.compute_model_performance(
      metric, per_task_metrics=True)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores['mae_score'] > 0
  assert isinstance(all_task_scores, dict)
  # Fixed: the original re-asserted len(multitask_scores) here; the
  # intent was to validate the per-task dict returned alongside it.
  assert len(all_task_scores) == 1


def test_multitask_model_evaluate_sklearn():
  """Test evaluation of a multitask metric."""
  n_tasks = 2