Commit 2fac2a50 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes

parent a67b080f
Loading
Loading
Loading
Loading
+49 −44
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ import sklearn.metrics
import logging
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
@@ -505,47 +506,50 @@ def mae_score(y_true, y_pred):
  return mean_absolute_error(y_true, y_pred)


def kappa_score(y_true, y_pred):
  """Calculate Cohen's kappa for binary classification tasks.

  See https://en.wikipedia.org/wiki/Cohen%27s_kappa

  Note that this implementation of Cohen's kappa expects binary labels
  and requires both classes (0 and 1) to be present in `y_true`.

  Parameters
  ----------
  y_true: np.ndarray
    Numpy array containing true values of shape `(N,)`
  y_pred: np.ndarray
    Numpy array containing predicted values of shape `(N,)`

  Returns
  -------
  kappa: float
    Cohen's kappa for this classification task.

  Raises
  ------
  ValueError: If class labels are not in [0, 1].
  AssertionError: If y_true and y_pred are not the same size, or if
    `y_true` does not contain both class labels 0 and 1.
  """
  assert len(y_true) == len(y_pred), 'Number of examples does not match.'
  yt = np.asarray(y_true, dtype=int)
  yp = np.asarray(y_pred, dtype=int)
  if not set(np.unique(yt)).issubset({0, 1}):
    raise ValueError("Class labels must be binary 0, 1")
  # Both classes must appear in y_true: if every label is the same class,
  # expected agreement can reach 1 and the kappa denominator below would
  # be zero.
  assert np.array_equal(
      np.unique(yt),
      [0, 1]), ('Class labels must be binary: %s' % np.unique(yt))
  n = len(yt)
  # Fraction of examples where prediction matches the true label.
  observed_agreement = np.true_divide(np.count_nonzero(yt == yp), n)
  # Agreement expected by chance from each vector's class frequencies.
  expected_agreement = np.true_divide(
      np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) +
      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0), n**2)
  kappa = np.true_divide(observed_agreement - expected_agreement,
                         1.0 - expected_agreement)
  return kappa
# Backward-compatible alias: `kappa_score` delegates to scikit-learn's
# `cohen_kappa_score` (imported at the top of this module), which supports
# multiclass labels, unlike the removed hand-rolled binary implementation.
kappa_score = cohen_kappa_score

#def kappa_score(y_true, y_pred):
#  """Calculate Cohen's kappa for classification tasks.
#
#  See https://en.wikipedia.org/wiki/Cohen%27s_kappa
#
#  Note that this implementation of Cohen's kappa expects binary labels.
#
#  Parameters
#  ----------
#  y_true: np.ndarray
#    Numpy array containing true values of shape `(N,)`
#  y_pred: np.ndarray
#    Numpy array containing predicted values of shape `(N,)`
#
#  Returns
#  -------
#  kappa: np.ndarray
#    Numpy array containing kappa for each classification task.
#
#  Raises
#  ------
#  AssertionError: If y_true and y_pred are not the same size, or if
#  class labels are not in [0, 1].
#  """
#  assert len(y_true) == len(y_pred), 'Number of examples does not match.'
#  yt = np.asarray(y_true, dtype=int)
#  yp = np.asarray(y_pred, dtype=int)
#  if not set(np.unique(yt)).issubset(set([0, 1])):
#    raise ValueError("Class labels must be binary 0, 1")
#  assert np.array_equal(
#      np.unique(yt),
#      [0, 1]), ('Class labels must be binary: %s' % np.unique(yt))
#  observed_agreement = np.true_divide(
#      np.count_nonzero(np.equal(yt, yp)), len(yt))
#  expected_agreement = np.true_divide(
#      np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) +
#      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0),
#      len(yt)**2)
#  kappa = np.true_divide(observed_agreement - expected_agreement,
#                         1.0 - expected_agreement)
#  return kappa


def bedroc_score(y_true, y_pred, alpha=20.0):
@@ -705,6 +709,7 @@ class Metric(object):
          "recall_score",
          "accuracy_score",
          "kappa_score",
          "cohen_kappa_score",
          "precision_score",
          "balanced_accuracy_score",
          "prc_auc_score",
@@ -719,9 +724,9 @@ class Metric(object):
        # behavior
        if classification_handling_mode is None:
          if self.metric.__name__ in [
              "matthews_corrcoef", "kappa_score", "balanced_accuracy_score",
              "recall_score", "jaccard_score", "jaccard_index", "pixel_error",
              "f1_score"
              "matthews_corrcoef", "cohen_kappa_score", "kappa_score",
              "balanced_accuracy_score", "recall_score", "jaccard_score",
              "jaccard_index", "pixel_error", "f1_score"
          ]:
            classification_handling_mode = "threshold"
          elif self.metric.__name__ in [
+16 −2
Original line number Diff line number Diff line
@@ -188,7 +188,9 @@ class Model(BaseEstimator):
               dataset: Dataset,
               metrics: List[Metric],
               transformers: List[Transformer] = [],
               per_task_metrics: bool = False):
               per_task_metrics: bool = False,
               use_sample_weights: bool = False,
               n_classes: int = 2):
    """
    Evaluates the performance of this model on specified dataset.

@@ -220,6 +222,14 @@ class Model(BaseEstimator):
      List of `dc.trans.Transformer` objects. These transformations
      must have been applied to `dataset` previously. The dataset will
      be untransformed for metric evaluation.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    use_sample_weights: bool, optional (default False)
      If set, use per-sample weights `w`.
    n_classes: int, optional (default None)
      If specified, will use `n_classes` as the number of unique classes
      in `self.dataset`. Note that this argument will be ignored for
      regression metrics.

    Returns
    -------
@@ -231,7 +241,11 @@ class Model(BaseEstimator):
      separately.
    """
    evaluator = Evaluator(self, dataset, transformers)
    return evaluator.compute_model_performance(metrics, **kwargs)
    return evaluator.compute_model_performance(
        metrics,
        per_task_metrics=per_task_metrics,
        use_sample_weights=use_sample_weights,
        n_classes=n_classes)

  def get_task_type(self) -> str:
    """
+3 −3
Original line number Diff line number Diff line
@@ -260,9 +260,9 @@ class Evaluator(object):
    use_sample_weights: bool, optional (default False)
      If set, use per-sample weights `w`.
    n_classes: int, optional (default None)
      If specified, will assume that all `metrics` are classification
      metrics and will use `n_classes` as the number of unique classes
      in `self.dataset`.
      If specified, will use `n_classes` as the number of unique classes
      in `self.dataset`. Note that this argument will be ignored for
      regression metrics.

    Returns
    -------
+0 −31
Original line number Diff line number Diff line
@@ -83,19 +83,6 @@ def test_evaluate_multiclass_classification_singletask():
  assert multitask_scores["metric-1"] >= 0


def test_multiclass_classification_singletask():
  """Evaluator should score a 5-class singletask classifier."""
  n_samples, n_features, n_labels = 100, 5, 5
  features = np.random.rand(n_samples, n_features)
  labels = np.random.randint(n_labels, size=(n_samples,))
  data = dc.data.NumpyDataset(features, labels)
  clf = dc.models.MultitaskClassifier(1, n_features, n_classes=n_labels)
  # Raw metric functions are accepted and auto-named "metric-1".
  scores = Evaluator(clf, data, []).compute_model_performance(
      dc.metrics.accuracy_score, n_classes=n_labels)
  assert len(scores) == 1
  assert scores["metric-1"] >= 0


def test_multitask_evaluator():
  """Test evaluation of a multitask metric."""
  n_tasks = 2
@@ -127,24 +114,6 @@ def test_model_evaluate_dc_metric():
  assert multitask_scores['mae_score'] > 0


def test_multitask_evaluator():
  """Test evaluation of a multitask metric.

  With `per_task_metrics=True`, `compute_model_performance` must return
  both the aggregate scores dict and a per-task scores dict, each keyed
  by metric name.
  """
  n_tasks = 2
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 2)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(2, 5)
  evaluator = Evaluator(model, dataset, [])
  metric = dc.metrics.Metric(dc.metrics.mae_score)
  multitask_scores, all_task_scores = evaluator.compute_model_performance(
      metric, per_task_metrics=True)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores['mae_score'] > 0
  assert isinstance(all_task_scores, dict)
  # Fixed: the original re-asserted len(multitask_scores) here; the
  # intent was to validate the per-task dict returned alongside it.
  assert len(all_task_scores) == 1


def test_multitask_model_evaluate_sklearn():
  """Test evaluation of a multitask metric."""
  n_tasks = 2