Unverified commit 4b8bc381 authored by Karl Leswing, committed by GitHub

Merge pull request #1265 from peastman/metric

Fixed incorrect code in metrics 
parents 2817b137 dc4eba99
+41 −52
@@ -3,10 +3,9 @@
 import numpy as np
 import warnings
 from deepchem.utils.save import log
-from sklearn.metrics import roc_auc_score
+import sklearn.metrics
 from sklearn.metrics import matthews_corrcoef
 from sklearn.metrics import recall_score
-from sklearn.metrics import accuracy_score
 from sklearn.metrics import r2_score
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics import mean_absolute_error
@@ -39,22 +38,29 @@ def from_one_hot(y, axis=1):
   return np.argmax(y, axis=axis)
 
 
-def compute_roc_auc_scores(y, y_pred):
-  """Transforms the results dict into roc-auc-scores and prints scores.
-
-  Parameters
-  ----------
-  results: dict
-  task_types: dict
-    dict mapping task names to output type. Each output type must be either
-    "classification" or "regression".
-  """
-  try:
-    score = roc_auc_score(y, y_pred)
-  except ValueError:
-    warnings.warn("ROC AUC score calculation failed.")
-    score = 0.5
-  return score
+def _ensure_one_hot(y):
+  """If necessary, convert class labels to one-hot encoding."""
+  if len(y.shape) == 1:
+    return to_one_hot(y)
+  return y
+
+
+def _ensure_class_labels(y):
+  """If necessary, convert one-hot encoding to class labels."""
+  if len(y.shape) == 2:
+    return from_one_hot(y)
+  return y
+
+
+def roc_auc_score(y, y_pred):
+  """Area under the receiver operating characteristic curve."""
+  return sklearn.metrics.roc_auc_score(_ensure_one_hot(y), y_pred)
+
+
+def accuracy_score(y, y_pred):
+  y = _ensure_class_labels(y)
+  y_pred = _ensure_class_labels(y_pred)
+  return sklearn.metrics.accuracy_score(y, y_pred)
 
 
 def balanced_accuracy_score(y, y_pred):
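
Note: a quick sanity check of the two new wrappers (a minimal sketch; it assumes a DeepChem build that includes this change, and the toy arrays are purely illustrative). roc_auc_score one-hot encodes integer labels before delegating to sklearn, while accuracy_score collapses probabilities back to class labels:

import numpy as np
import deepchem as dc

# 1-D class labels and (n_samples, n_classes) probabilities for 4 samples.
y_true = np.array([0, 1, 1, 0])
y_prob = np.array([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7], [0.6, 0.4]])

# Internally one-hot encodes y_true via _ensure_one_hot before calling sklearn.
print(dc.metrics.roc_auc_score(y_true, y_prob))   # 1.0 on this toy data

# Internally argmaxes y_prob via _ensure_class_labels before calling sklearn.
print(dc.metrics.accuracy_score(y_true, y_prob))  # 1.0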
@@ -74,6 +80,7 @@ def pearson_r2_score(y, y_pred):
 
 def prc_auc_score(y, y_pred):
   """Compute area under precision-recall curve"""
+  y = _ensure_one_hot(y)
   assert y_pred.shape == y.shape
   assert y_pred.shape[1] == 2
   precision, recall, _ = precision_recall_curve(y[:, 1], y_pred[:, 1])
@@ -112,13 +119,14 @@ def kappa_score(y_true, y_pred):
   yt = np.asarray(y_true, dtype=int)
   yp = np.asarray(y_pred, dtype=int)
   assert np.array_equal(
-      np.unique(yt), [0,
-                      1]), ('Class labels must be binary: %s' % np.unique(yt))
+      np.unique(yt),
+      [0, 1]), ('Class labels must be binary: %s' % np.unique(yt))
   observed_agreement = np.true_divide(
       np.count_nonzero(np.equal(yt, yp)), len(yt))
   expected_agreement = np.true_divide(
       np.count_nonzero(yt == 1) * np.count_nonzero(yp == 1) +
-      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0), len(yt)**2)
+      np.count_nonzero(yt == 0) * np.count_nonzero(yp == 0),
+      len(yt)**2)
   kappa = np.true_divide(observed_agreement - expected_agreement,
                          1.0 - expected_agreement)
   return kappa
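
Note: for reference, the arithmetic kappa_score implements is Cohen's kappa, kappa = (p_o - p_e) / (1 - p_e), with observed agreement p_o and chance agreement p_e. A standalone numeric check (toy data, not from this PR):

import numpy as np

yt = np.array([0, 0, 1, 1, 1, 0])
yp = np.array([0, 1, 1, 1, 0, 0])
p_o = np.mean(yt == yp)                    # observed agreement: 4/6
p_e = ((yt == 1).sum() * (yp == 1).sum() +
       (yt == 0).sum() * (yp == 0).sum()) / len(yt)**2  # (3*3 + 3*3)/36 = 0.5
print((p_o - p_e) / (1.0 - p_e))           # (2/3 - 1/2) / (1/2) = 1/3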
@@ -205,25 +213,20 @@ class Metric(object):
     -------
     A numpy nd.array containing metric values for each task.
     """
-    if len(y_true.shape) > 1:
-      n_samples, n_tasks = y_true.shape[0], y_true.shape[1]
-    else:
-      n_samples, n_tasks = y_true.shape[0], 1
-    if self.mode == "classification":
-      y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
-    else:
-      y_pred = np.reshape(y_pred, (n_samples, n_tasks))
-    y_true = np.reshape(y_true, (n_samples, n_tasks))
+    n_samples = y_true.shape[0]
+    expected_dims = (3 if self.mode == "classification" else 2)
+    if len(y_pred.shape) < expected_dims:
+      n_tasks = 1
+      y_true = np.expand_dims(y_true, 1)
+      y_pred = np.expand_dims(y_pred, 1)
+    else:
+      n_tasks = y_pred.shape[1]
     if w is None or len(w) == 0:
-      w = np.ones_like(y_true)
-    assert y_true.shape[0] == y_pred.shape[0] == w.shape[0]
+      w = np.ones((n_samples, n_tasks))
     computed_metrics = []
     for task in range(n_tasks):
       y_task = y_true[:, task]
-      y_pred_task = y_pred[:, task]
+      if self.mode == "regression":
+        y_pred_task = y_pred[:, task]
+      else:
+        y_pred_task = y_pred[:, task, :]
       w_task = w[:, task]
 
       metric_value = self.compute_singletask_metric(y_task, y_pred_task, w_task)
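
Note: the rework above changes the expected input convention: shapes are now inferred from y_pred rather than y_true, with classification predictions given as (n_samples, n_tasks, n_classes). A minimal sketch of a call under the new convention (it assumes the Metric API shown in this hunk; the data is illustrative):

import numpy as np
import deepchem as dc

y_true = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])    # (4 samples, 2 tasks)
probs = np.random.rand(4, 2, 1)
y_pred = np.concatenate([1.0 - probs, probs], axis=2)  # (4, 2, 2) class probabilities

metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")
print(metric.compute_metric(y_true, y_pred))           # mean ROC AUC over both tasks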
@@ -271,26 +274,12 @@ class Metric(object):
     # If there are no nonzero examples, metric is ill-defined.
     if not y_true.size:
       return np.nan
 
-    y_true = np.reshape(y_true, (n_samples,))
-    if self.mode == "classification":
-      n_classes = y_pred.shape[-1]
-      # TODO(rbharath): This has been a major source of bugs. Is there a more
-      # robust characterization of which metrics require class-probs and which
-      # don't?
-      if "roc_auc_score" in self.name or "prc_auc_score" in self.name:
-        y_true = to_one_hot(y_true).astype(int)
-        y_pred = np.reshape(y_pred, (n_samples, n_classes))
-      else:
-        y_true = y_true.astype(int)
-        # Reshape to handle 1-d edge cases
-        y_pred = np.reshape(y_pred, (n_samples, n_classes))
-        y_pred = from_one_hot(y_pred)
-    else:
-      y_pred = np.reshape(y_pred, (n_samples,))
-
     if self.threshold is not None:
       y_pred = np.greater(y_pred, threshold)
+    if len(y_true.shape) == 0:
+      y_true = np.expand_dims(y_true, 0)
+    if len(y_pred.shape) == 0:
+      y_pred = np.expand_dims(y_pred, 0)
     try:
       metric_value = self.metric(y_true, y_pred)
     except (AssertionError, ValueError) as e:
+7 −18
@@ -178,6 +178,8 @@ class GeneratorEvaluator(object):
     self.weights = weights
     if len(self.label_keys) != len(self.output_keys):
       raise ValueError("Must have same number of labels and outputs")
+    if len(self.label_keys) != 1:
+      raise ValueError("GeneratorEvaluator currently only supports one label")
 
   def compute_model_performance(self, metrics, per_task_metrics=False):
     """
@@ -196,30 +198,17 @@
 
     def generator_closure():
       for feed_dict in self.generator:
-        labels = []
-        for layer in self.label_keys:
-          labels.append(feed_dict[layer])
-          del feed_dict[layer]
-        for weight in self.weights:
-          w.append(feed_dict[weight])
-          del feed_dict[weight]
-        y.append(np.array(labels))
+        y.append(feed_dict[self.label_keys[0]])
+        if len(self.weights) > 0:
+          w.append(feed_dict[self.weights[0]])
         yield feed_dict
 
     if not len(metrics):
       return {}
     else:
       mode = metrics[0].mode
-    if mode == "classification":
-      y_pred = self.model.predict_on_generator(generator_closure())
-      y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
-      y = np.reshape(y, newshape=(-1, self.n_tasks, self.n_classes))
-      y = from_one_hot(y, axis=-1)
-    else:
-      y_pred = self.model.predict_on_generator(generator_closure())
-      y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
-      y = np.reshape(y, newshape=(-1, self.n_tasks))
-      y_pred = np.reshape(y_pred, newshape=(-1, self.n_tasks))
+    y_pred = self.model.predict_on_generator(generator_closure())
+    y = np.concatenate(y, axis=0)
     multitask_scores = {}
     all_task_scores = {}
 
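Note: a self-contained sketch of the simplified closure pattern above (the self.* attributes are stand-ins inferred from this hunk; the batch dicts are illustrative). Each feed_dict maps a graph input to a batch array; the single label is recorded as batches stream past, then stitched back together with np.concatenate:

import numpy as np

def make_closure(generator, label_key, weight_keys, y, w):
  def generator_closure():
    for feed_dict in generator:
      y.append(feed_dict[label_key])         # exactly one label per batch
      if len(weight_keys) > 0:
        w.append(feed_dict[weight_keys[0]])  # optional per-sample weights
      yield feed_dict
  return generator_closure

y, w = [], []
batches = [{"label": np.zeros((4, 2)), "x": np.ones((4, 3))} for _ in range(3)]
for fd in make_closure(iter(batches), "label", [], y, w)():
  pass                                       # stands in for predict_on_generator
print(np.concatenate(y, axis=0).shape)       # (12, 2)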
+44 −90
@@ -5,7 +5,7 @@ import numpy as np
 from deepchem.data import NumpyDataset
 from deepchem.data.datasets import Databag
 from deepchem.models.tensorgraph.layers import Dense, ReduceMean, SoftMax, SoftMaxCrossEntropy
-from deepchem.models.tensorgraph.layers import Feature, Label
+from deepchem.models.tensorgraph.layers import Feature, Label, Reshape
 from deepchem.models.tensorgraph.layers import ReduceSquareDifference
 from nose.tools import assert_true
 from flaky import flaky
@@ -17,6 +17,8 @@ class TestGeneratorEvaluator(TestCase):
   def test_compute_model_performance_multitask_classifier(self):
     n_data_points = 20
     n_features = 1
+    n_tasks = 2
+    n_classes = 2
 
     X = np.ones(shape=(n_data_points // 2, n_features)) * -1
     X1 = np.ones(shape=(n_data_points // 2, n_features))
@@ -25,43 +27,29 @@
     class_0 = np.array([[1.0, 0.0] for x in range(int(n_data_points / 2))])
     y1 = np.concatenate((class_0, class_1))
     y2 = np.concatenate((class_1, class_0))
-    X = NumpyDataset(X)
-    ys = [NumpyDataset(y1), NumpyDataset(y2)]
-
-    databag = Databag()
+    y = np.stack([y1, y2], axis=1)
+    dataset = NumpyDataset(X, y)
 
     features = Feature(shape=(None, n_features))
-    databag.add_dataset(features, X)
-
-    outputs = []
-    entropies = []
-    labels = []
-    for i in range(2):
-      label = Label(shape=(None, 2))
-      labels.append(label)
-      dense = Dense(out_channels=2, in_layers=[features])
-      output = SoftMax(in_layers=[dense])
-      smce = SoftMaxCrossEntropy(in_layers=[label, dense])
-
-      entropies.append(smce)
-      outputs.append(output)
-      databag.add_dataset(label, ys[i])
-
-    total_loss = ReduceMean(in_layers=entropies)
+    label = Label(shape=(None, n_tasks, n_classes))
+    dense = Dense(out_channels=n_tasks * n_classes, in_layers=[features])
+    logits = Reshape(shape=(None, n_tasks, n_classes), in_layers=dense)
+    output = SoftMax(in_layers=[logits])
+    smce = SoftMaxCrossEntropy(in_layers=[label, logits])
+    total_loss = ReduceMean(in_layers=smce)
 
     tg = dc.models.TensorGraph(learning_rate=0.01, batch_size=n_data_points)
-    for output in outputs:
-      tg.add_output(output)
+    tg.add_output(output)
     tg.set_loss(total_loss)
 
-    tg.fit_generator(
-        databag.iterbatches(
-            epochs=1000, batch_size=tg.batch_size, pad_batches=True))
+    tg.fit(dataset, nb_epoch=1000)
     metric = dc.metrics.Metric(
         dc.metrics.roc_auc_score, np.mean, mode="classification")
 
     scores = tg.evaluate_generator(
-        databag.iterbatches(), [metric], labels=labels, per_task_metrics=True)
+        tg.default_generator(dataset), [metric],
+        labels=[label],
+        per_task_metrics=True)
     scores = list(scores[1].values())
     # Loosening atol to see if tests stop failing sporadically
     assert_true(np.all(np.isclose(scores, [1.0, 1.0], atol=0.50)))
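
Note: a shape check for the stacked multitask labels used in the rewritten test above (values illustrative):

import numpy as np

class_0 = np.array([[1.0, 0.0]] * 10)    # one-hot "class 0" rows
class_1 = np.array([[0.0, 1.0]] * 10)
y1 = np.concatenate((class_0, class_1))  # task 1 labels: (20, 2)
y2 = np.concatenate((class_1, class_0))  # task 2 labels: (20, 2)
y = np.stack([y1, y2], axis=1)           # (n_samples, n_tasks, n_classes)
print(y.shape)                           # (20, 2, 2)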
@@ -75,44 +63,28 @@ class TestGeneratorEvaluator(TestCase):
     X = np.concatenate((X, X1))
     class_1 = np.array([[0.0, 1.0] for x in range(int(n_data_points / 2))])
     class_0 = np.array([[1.0, 0.0] for x in range(int(n_data_points / 2))])
-    y1 = np.concatenate((class_0, class_1))
-    X = NumpyDataset(X)
-    ys = [NumpyDataset(y1)]
-
-    databag = Databag()
+    y = np.concatenate((class_0, class_1))
+    dataset = NumpyDataset(X, y)
 
     features = Feature(shape=(None, n_features))
-    databag.add_dataset(features, X)
-
-    outputs = []
-    entropies = []
-    labels = []
-    for i in range(1):
-      label = Label(shape=(None, 2))
-      labels.append(label)
-      dense = Dense(out_channels=2, in_layers=[features])
-      output = SoftMax(in_layers=[dense])
-      smce = SoftMaxCrossEntropy(in_layers=[label, dense])
-
-      entropies.append(smce)
-      outputs.append(output)
-      databag.add_dataset(label, ys[i])
-
-    total_loss = ReduceMean(in_layers=entropies)
+    label = Label(shape=(None, 2))
+    dense = Dense(out_channels=2, in_layers=[features])
+    output = SoftMax(in_layers=[dense])
+    smce = SoftMaxCrossEntropy(in_layers=[label, dense])
+    total_loss = ReduceMean(in_layers=smce)
 
     tg = dc.models.TensorGraph(learning_rate=0.1)
-    for output in outputs:
-      tg.add_output(output)
+    tg.add_output(output)
     tg.set_loss(total_loss)
 
-    tg.fit_generator(
-        databag.iterbatches(
-            epochs=1000, batch_size=tg.batch_size, pad_batches=True))
+    tg.fit(dataset, nb_epoch=1000)
     metric = dc.metrics.Metric(
         dc.metrics.roc_auc_score, np.mean, mode="classification")
 
     scores = tg.evaluate_generator(
-        databag.iterbatches(), [metric], labels=labels, per_task_metrics=True)
+        tg.default_generator(dataset), [metric],
+        labels=[label],
+        per_task_metrics=True)
     scores = list(scores[1].values())
     assert_true(np.isclose(scores, [1.0], atol=0.05))

@@ -120,52 +92,34 @@ class TestGeneratorEvaluator(TestCase):
     random_seed = 42
     n_data_points = 20
     n_features = 2
+    n_tasks = 2
     np.random.seed(seed=random_seed)
 
     X = np.random.rand(n_data_points, n_features)
-    y1 = np.expand_dims(np.array([0.5 for x in range(n_data_points)]), axis=-1)
-    y2 = np.expand_dims(np.array([-0.5 for x in range(n_data_points)]), axis=-1)
-    X = NumpyDataset(X)
-    ys = [NumpyDataset(y1), NumpyDataset(y2)]
-
-    databag = Databag()
+    y1 = np.array([0.5 for x in range(n_data_points)])
+    y2 = np.array([-0.5 for x in range(n_data_points)])
+    y = np.stack([y1, y2], axis=1)
+    dataset = NumpyDataset(X, y)
 
     features = Feature(shape=(None, n_features))
-    databag.add_dataset(features, X)
-
-    outputs = []
-    losses = []
-    labels = []
-    for i in range(2):
-      label = Label(shape=(None, 1))
-      dense = Dense(out_channels=1, in_layers=[features])
-      loss = ReduceSquareDifference(in_layers=[dense, label])
-
-      outputs.append(dense)
-      losses.append(loss)
-      labels.append(label)
-      databag.add_dataset(label, ys[i])
-
-    total_loss = ReduceMean(in_layers=losses)
+    label = Label(shape=(None, n_tasks))
+    dense = Dense(out_channels=n_tasks, in_layers=[features])
+    loss = ReduceSquareDifference(in_layers=[dense, label])
 
-    tg = dc.models.TensorGraph(
-        mode="regression",
-        batch_size=20,
-        random_seed=random_seed,
-        learning_rate=0.1)
-    for output in outputs:
-      tg.add_output(output)
-    tg.set_loss(total_loss)
+    tg = dc.models.TensorGraph(random_seed=random_seed, learning_rate=0.1)
+    tg.add_output(dense)
+    tg.set_loss(loss)
 
-    tg.fit_generator(
-        databag.iterbatches(
-            epochs=1000, batch_size=tg.batch_size, pad_batches=True))
+    tg.fit(dataset, nb_epoch=1000)
     metric = [
         dc.metrics.Metric(
             dc.metrics.mean_absolute_error, np.mean, mode="regression"),
     ]
     scores = tg.evaluate_generator(
-        databag.iterbatches(), metric, labels=labels, per_task_metrics=True)
+        tg.default_generator(dataset),
+        metric,
+        labels=[label],
+        per_task_metrics=True)
     scores = list(scores[1].values())
     assert_true(np.all(np.isclose(scores, [0.0, 0.0], atol=1.0)))

+1 −1
@@ -108,7 +108,7 @@ def compute_scores(optimize):
   print()
   print('Cross entropy loss:', np.mean(losses))
   print('Prediction accuracy:', accuracy_score(y_true, y_pred > 0.5))
-  print('ROC AUC:', dc.metrics.compute_roc_auc_scores(y_true, y_pred))
+  print('ROC AUC:', dc.metrics.roc_auc_score(y_true, y_pred))
   print()