Commit 3b0f9836 authored by Bharath Ramsundar

changes

parent 6b467aa2
+30 −3
@@ -72,6 +72,13 @@ def normalize_weight_shape(w, n_samples, n_tasks):
    The number of tasks. If `w` is 2d ndarray, then we should have
    `w.shape[1] == n_tasks`.

  Examples
  --------
  >>> import numpy as np
  >>> import deepchem as dc
  >>> n_samples, n_tasks = 10, 3
  >>> w_out = dc.metrics.normalize_weight_shape(None, n_samples, n_tasks)
  >>> (w_out == np.ones((n_samples, n_tasks))).all()
  True

  Returns
  -------
  w_out: np.ndarray
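For context, here is a minimal sketch of the broadcasting behavior that the docstring and the doctest above describe. `normalize_weight_shape_sketch` is a hypothetical stand-in; only the `w is None` branch is pinned down by the doctest, and the handling of 1-d weights is an assumption.

import numpy as np

def normalize_weight_shape_sketch(w, n_samples, n_tasks):
  # No weights supplied: every (sample, task) pair gets unit weight,
  # matching the doctest above.
  if w is None:
    return np.ones((n_samples, n_tasks))
  w = np.asarray(w)
  # Per-sample weights (assumed): repeat each sample's weight across tasks.
  if w.ndim == 1:
    return np.tile(w[:, None], (1, n_tasks))
  # Already two-dimensional: the docstring requires w.shape[1] == n_tasks.
  assert w.shape == (n_samples, n_tasks)
  return w

print(normalize_weight_shape_sketch(None, 10, 3).shape)  # (10, 3)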
@@ -302,7 +309,20 @@ def accuracy_score(y, y_pred):


def balanced_accuracy_score(y, y_pred):
  """Computes balanced accuracy score."""
  """Computes balanced accuracy score.

  Parameters
  ----------
  y: np.ndarray
    Of shape `(N_samples,)`
  y_pred: np.ndarray
    Of shape `(N_samples,)`

  Returns
  -------
  score: float
    The balanced_accuracy. A number between 0 and 1.
  """
  num_positive = float(np.count_nonzero(y))
  num_negative = float(len(y) - num_positive)
  pos_weight = num_negative / num_positive
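The hunk cuts off before the rest of the body, but the `pos_weight` construction above is enough to see why this yields balanced accuracy: weighting positives by `num_negative / num_positive` makes the weighted accuracy equal the mean of per-class recall. A small check of that identity, assuming the remaining lines apply these values as sample weights:

import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score

y = np.array([1, 1, 0, 0, 0, 0])
y_pred = np.array([1, 0, 0, 0, 1, 0])

# Weight positives by num_negative / num_positive so that both classes
# contribute equally to the weighted accuracy.
num_positive = float(np.count_nonzero(y))
num_negative = float(len(y) - num_positive)
pos_weight = num_negative / num_positive
sample_weight = np.where(y == 1, pos_weight, 1.0)

weighted_acc = accuracy_score(y, y_pred, sample_weight=sample_weight)
print(np.isclose(weighted_acc, balanced_accuracy_score(y, y_pred)))  # True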
@@ -322,8 +342,15 @@ def jaccard_index(y, y_pred):

  Parameters
  ----------
  y: ground truth array
  y_pred: predicted array
  y: np.ndarray
    ground truth array
  y_pred: np.ndarray
    predicted array

  Returns
  -------
  score: float
    The jaccard index. A number between 0 and 1.
  """
  return jaccard_score(y, y_pred)
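Since `jaccard_index` simply delegates to `sklearn.metrics.jaccard_score`, a short worked example of the score it returns for binary labels (true positives over the union of predicted and actual positives):

import numpy as np
from sklearn.metrics import jaccard_score

y = np.array([1, 1, 0, 0, 1])
y_pred = np.array([1, 0, 0, 1, 1])

# Intersection of true and predicted positives: indices {0, 4} -> 2 samples.
# Union of true and predicted positives: indices {0, 1, 3, 4} -> 4 samples.
print(jaccard_score(y, y_pred))  # 0.5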

+58 −56
@@ -7,9 +7,7 @@ import unittest
from deepchem import metrics


class MetricsTest(unittest.TestCase):

  def test_kappa_score(self):
def test_kappa_score():
  y_true = [1, 0, 1, 0]
  y_pred = [0.8, 0.2, 0.3, 0.4]  # [1, 0, 0, 0] with 0.5 threshold
  kappa = dc.metrics.kappa_score(y_true, np.greater(y_pred, 0.5))
@@ -17,16 +15,17 @@ class MetricsTest(unittest.TestCase):
  expected_agreement = ((2 * 1) + (2 * 3)) / 4.0**2
  expected_kappa = np.true_divide(observed_agreement - expected_agreement,
                                  1.0 - expected_agreement)
    self.assertAlmostEqual(kappa, expected_kappa)
  np.testing.assert_almost_equal(kappa, expected_kappa)


  def test_one_sample(self):
def test_one_sample():
  """Test that the metrics won't raise error even in an extreme condition
  where there is only one sample with w > 0.
  """
  np.random.seed(123)
  n_samples = 2
    y_true = np.array([0, 0])
    y_pred = np.random.rand(n_samples, 2)
  y_true = np.random.randint(2, size=(n_samples,))
  y_pred = np.random.randint(2, size=(n_samples,))
  w = np.array([0, 1])
  all_metrics = [
      dc.metrics.Metric(dc.metrics.recall_score),
@@ -35,9 +34,11 @@ class MetricsTest(unittest.TestCase):
  ]
  for metric in all_metrics:
    score = metric.compute_singletask_metric(y_true, y_pred, w)
      self.assertTrue(np.isnan(score) or score == 0)
    print("score")
    print(score)

  def test_r2_score(self):

def test_r2_score():
  """Test that R^2 metric passes basic sanity tests"""
  np.random.seed(123)
  n_samples = 10
@@ -48,7 +49,8 @@ class MetricsTest(unittest.TestCase):
      dc.metrics.r2_score(y_true, y_pred),
      regression_metric.compute_metric(y_true, y_pred))

  def test_bedroc_score(self):

def test_bedroc_score():
  """Test BEDROC."""
  num_actives = 20
  num_total = 400
@@ -61,7 +63,7 @@ class MetricsTest(unittest.TestCase):
  y_pred_best = dc.metrics.to_one_hot(
      np.concatenate([y_true_actives, y_true_inactives]))
  best_score = dc.metrics.bedroc_score(y_true, y_pred_best)
    self.assertAlmostEqual(best_score, 1.0)
  np.testing.assert_almost_equal(best_score, 1.0)

  # Worst score case
  worst_pred_actives = np.zeros(num_actives)
@@ -69,4 +71,4 @@ class MetricsTest(unittest.TestCase):
  y_pred_worst = dc.metrics.to_one_hot(
      np.concatenate([worst_pred_actives, worst_pred_inactives]))
  worst_score = dc.metrics.bedroc_score(y_true, y_pred_worst)
    self.assertAlmostEqual(worst_score, 0.0, 4)
  np.testing.assert_almost_equal(worst_score, 0.0, 4)
+6 −3
@@ -657,11 +657,14 @@ class GraphConvModel(KerasModel):
  """Graph Convolutional Models.

  This class implements the graph convolutional model from the
  following paper:
  following paper [1]_.


  Duvenaud, David K., et al. "Convolutional networks on graphs for learning molecular fingerprints." Advances in neural information processing systems. 2015.

  References
  ----------
  .. [1] Duvenaud, David K., et al. "Convolutional networks on graphs for
     learning molecular fingerprints." Advances in neural information
     processing systems. 2015.
  """

  def __init__(self,
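To make the docstring concrete, a minimal usage sketch of `GraphConvModel`, mirroring the graph-conv tests added later in this commit; the task count, labels, and single training epoch are illustrative only.

import numpy as np
import deepchem as dc

# Featurize two molecules into graph-convolution inputs.
smiles = ["C", "CC"]
featurizer = dc.feat.ConvMolFeaturizer()
X = featurizer.featurize(smiles)
y = np.random.randint(2, size=(len(smiles),))
dataset = dc.data.NumpyDataset(X, y)

# Single-task classifier built on Duvenaud-style graph convolutions.
model = dc.models.GraphConvModel(n_tasks=1, mode="classification")
model.fit(dataset, nb_epoch=1)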
+2 −1
@@ -5,7 +5,6 @@ import pytest
import scipy

import deepchem as dc
import tensorflow as tf
from deepchem.data import NumpyDataset
from deepchem.models import GraphConvModel, DAGModel, WeaveModel, MPNNModel
from deepchem.molnet import load_bace_classification, load_delaney
@@ -190,6 +189,7 @@ class TestGraphModels(unittest.TestCase):

  @pytest.mark.slow
  def test_dag_regression_model(self):
    import tensorflow as tf
    np.random.seed(1234)
    tf.random.set_seed(1234)
    tasks, dataset, transformers, metric = self.get_dataset(
@@ -214,6 +214,7 @@ class TestGraphModels(unittest.TestCase):

  @pytest.mark.slow
  def test_dag_regression_uncertainty(self):
    import tensorflow as tf
    np.random.seed(1234)
    tf.random.set_seed(1234)
    tasks, dataset, transformers, metric = self.get_dataset(
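The only change in this file is to defer the TensorFlow import into the individual slow tests, so that merely importing the test module does not require TensorFlow. In isolation the pattern looks like this sketch (test body elided):

import numpy as np
import pytest

@pytest.mark.slow
def test_dag_regression_model():
  # Deferred import: TensorFlow is only pulled in when the slow test runs,
  # so collecting the test module does not require it.
  import tensorflow as tf
  np.random.seed(1234)
  tf.random.set_seed(1234)
  ...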
+326 −247
@@ -7,16 +7,7 @@ from deepchem.utils.evaluate import Evaluator
from deepchem.utils.evaluate import GeneratorEvaluator


class TestEvaluator(unittest.TestCase):

  def setUp(self):
    """Perform common setup for tests."""
    X = np.random.rand(10, 5)
    y = np.random.rand(10, 1)
    self.dataset = dc.data.NumpyDataset(X, y)
    self.model = dc.models.MultitaskRegressor(1, 5)

  def test_multiclass_threshold_predictions(self):
def test_multiclass_threshold_predictions():
  """Check prediction thresholding works correctly."""
  # Construct a random class probability matrix
  y = np.random.rand(10, 5)
@@ -26,7 +17,8 @@ class TestEvaluator(unittest.TestCase):
  assert y_out.shape == (10,)
  assert np.allclose(y_out, np.argmax(y, axis=1))

  def test_binary_threshold_predictions(self):

def test_binary_threshold_predictions():
  """Check prediction thresholding works correctly."""
  # Construct a random class probability matrix
  y = np.random.rand(10, 2)
@@ -34,113 +26,24 @@ class TestEvaluator(unittest.TestCase):
  y = y / y_sums[:, None]
  y_out = dc.metrics.threshold_predictions(y, threshold=0.3)
  assert y_out.shape == (10,)
    assert np.allclose(y_out, np.where(y[:, 1] >= 0.3, np.ones(10),
                                       np.zeros(10)))

  def test_evaluator_dc_metric(self):
    """Test an evaluator on a dataset."""
    evaluator = Evaluator(self.model, self.dataset, [])
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = evaluator.compute_model_performance(metric)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0

  def test_model_evaluate_dc_metric(self):
    """Test a model evaluate on a dataset."""
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = self.model.evaluate(self.dataset, metric, [])
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0

  def test_evaluator_dc_multi_metric(self):
    """Test an evaluator on a dataset."""
    evaluator = Evaluator(self.model, self.dataset, [])
    metric1 = dc.metrics.Metric(dc.metrics.mae_score)
    metric2 = dc.metrics.Metric(dc.metrics.r2_score)
    multitask_scores = evaluator.compute_model_performance([metric1, metric2])
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 2
    assert multitask_scores['mae_score'] > 0
    assert "r2_score" in multitask_scores
  assert np.allclose(y_out, np.where(y[:, 1] >= 0.3, np.ones(10), np.zeros(10)))
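These two tests pin down the behavior of `dc.metrics.threshold_predictions`: argmax over multiclass probabilities, and a cut-off on the positive-class probability in the binary case. A small sketch written directly from the assertions above; `threshold_predictions_sketch` is a hypothetical helper, not the library implementation.

import numpy as np

def threshold_predictions_sketch(y, threshold=None):
  # Binary probabilities with an explicit threshold: predict class 1
  # whenever its probability clears the cut-off (as asserted above).
  if threshold is not None and y.shape[1] == 2:
    return np.where(y[:, 1] >= threshold, 1, 0)
  # Otherwise fall back to the most probable class.
  return np.argmax(y, axis=1)

probs = np.array([[0.8, 0.2], [0.6, 0.4]])
print(threshold_predictions_sketch(probs, threshold=0.3))  # [0 1]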

  def test_model_evaluate_dc_multi_metric(self):
    """Test an evaluator on a dataset."""
    metric1 = dc.metrics.Metric(dc.metrics.mae_score)
    metric2 = dc.metrics.Metric(dc.metrics.r2_score)
    multitask_scores = self.model.evaluate(self.dataset, [metric1, metric2])
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 2
    assert multitask_scores['mae_score'] > 0
    assert "r2_score" in multitask_scores

  def test_evaluator_sklearn_metric(self):
    """Test an evaluator on a dataset."""
    evaluator = Evaluator(self.model, self.dataset, [])
    multitask_scores = evaluator.compute_model_performance(
        sklearn.metrics.mean_absolute_error)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    # Note that since no name as provided, metrics are index by order
    # given.
    assert multitask_scores['metric-1'] > 0

  def test_model_evaluate_sklearn_metric(self):
    """Test a model evaluate on a dataset."""
    multitask_scores = self.model.evaluate(self.dataset,
                                           sklearn.metrics.mean_absolute_error)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    # Note that since no name as provided, metrics are index by order
    # given.
    assert multitask_scores['metric-1'] > 0

  def test_evaluator_sklearn_multi_metric(self):
    """Test an evaluator on a dataset."""
    evaluator = Evaluator(self.model, self.dataset, [])
    multitask_scores = evaluator.compute_model_performance(
        [sklearn.metrics.mean_absolute_error, sklearn.metrics.r2_score])
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores.keys()) == 2
    # Note that since no name as provided, metrics are index by order
    # given.
    assert multitask_scores['metric-1'] > 0
    assert "metric-2" in multitask_scores

  def test_model_evaluate_sklearn_multi_metric(self):
def test_evaluator_dc_metric():
  """Test an evaluator on a dataset."""
    multitask_scores = self.model.evaluate(
        self.dataset,
        [sklearn.metrics.mean_absolute_error, sklearn.metrics.r2_score])
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores.keys()) == 2
    # Note that since no name as provided, metrics are index by order
    # given.
    assert multitask_scores['metric-1'] > 0
    assert "metric-2" in multitask_scores

  def test_generator_evaluator_dc_metric_multitask(self):
    """Test generator evaluator on a generator."""
    generator = self.model.default_generator(self.dataset, pad_batches=False)
    evaluator = GeneratorEvaluator(self.model, generator, [])
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  evaluator = Evaluator(model, dataset, [])
  metric = dc.metrics.Metric(dc.metrics.mae_score)
  multitask_scores = evaluator.compute_model_performance(metric)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores['mae_score'] > 0

  def test_generator_evaluator_dc_metric_multitask_single_point(self):
    """Test generator evaluator on a generator."""
    generator = self.model.default_generator(self.dataset, pad_batches=False)
    evaluator = GeneratorEvaluator(self.model, generator, [])
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = evaluator.compute_model_performance(metric)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0

  def test_multiclass_classification_singletask(self):
def test_multiclass_classification_singletask():
  """Test multiclass classification evaluation."""
  X = np.random.rand(100, 5)
  y = np.random.randint(5, size=(100,))
@@ -148,11 +51,12 @@ class TestEvaluator(unittest.TestCase):
  model = dc.models.MultitaskClassifier(1, 5, n_classes=5)
  evaluator = Evaluator(model, dataset, [])
  multitask_scores = evaluator.compute_model_performance(
        sklearn.metrics.roc_auc_score, n_classes=5)
      dc.metrics.roc_auc_score, n_classes=5)
  assert len(multitask_scores) == 1
  assert multitask_scores["metric-1"] >= 0

  def test_sklearn_multiclass_classification_singletask(self):

def test_sklearn_multiclass_classification_singletask():
  """Test multiclass classification evaluation."""
  X = np.random.rand(100, 5)
  y = np.random.randint(5, size=(100,))
@@ -162,42 +66,44 @@ class TestEvaluator(unittest.TestCase):
  model.fit(dataset)
  evaluator = Evaluator(model, dataset, [])
  multitask_scores = evaluator.compute_model_performance(
        sklearn.metrics.roc_auc_score, n_classes=5)
      dc.metrics.roc_auc_score, n_classes=5)
  assert len(multitask_scores) == 1
  assert multitask_scores["metric-1"] >= 0

  def test_evaluate_multiclass_classification_singletask(self):

def test_evaluate_multiclass_classification_singletask():
  """Test multiclass classification evaluation."""
  X = np.random.rand(100, 5)
  y = np.random.randint(5, size=(100,))
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskClassifier(1, 5, n_classes=5)
  multitask_scores = model.evaluate(
        dataset, sklearn.metrics.roc_auc_score, n_classes=5)
      dataset, dc.metrics.roc_auc_score, n_classes=5)
  assert len(multitask_scores) == 1
  assert multitask_scores["metric-1"] >= 0

  def test_multiclass_classification_singletask(self):

def test_multiclass_classification_singletask():
  """Test multiclass classification evaluation."""
  X = np.random.rand(100, 5)
  y = np.random.randint(5, size=(100,))
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskClassifier(1, 5, n_classes=5)
    # TODO: Fix this case with correct thresholding
  evaluator = Evaluator(model, dataset, [])
  multitask_scores = evaluator.compute_model_performance(
        sklearn.metrics.accuracy_score, n_classes=5, threshold=True)
      dc.metrics.accuracy_score, n_classes=5, threshold=True)
  assert len(multitask_scores) == 1
  assert multitask_scores["metric-1"] >= 0

  def test_multitask_evaluator(self):

def test_multitask_evaluator():
  """Test evaluation of a multitask metric."""
  n_tasks = 2
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 2, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(2, 5)
    evaluator = Evaluator(self.model, self.dataset, [])
  evaluator = Evaluator(model, dataset, [])
  metric = dc.metrics.Metric(dc.metrics.mae_score)
  multitask_scores, all_task_scores = evaluator.compute_model_performance(
      metric, per_task_metrics=True)
@@ -207,7 +113,21 @@ class TestEvaluator(unittest.TestCase):
  assert isinstance(all_task_scores, dict)
  assert len(multitask_scores) == 1

  def test_multitask_evaluator(self):

def test_model_evaluate_dc_metric():
  """Test a model evaluate on a dataset."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  metric = dc.metrics.Metric(dc.metrics.mae_score)
  multitask_scores = model.evaluate(dataset, metric, [])
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores['mae_score'] > 0


def test_multitask_evaluator():
  """Test evaluation of a multitask metric."""
  n_tasks = 2
  X = np.random.rand(10, 5)
@@ -224,7 +144,8 @@ class TestEvaluator(unittest.TestCase):
  assert isinstance(all_task_scores, dict)
  assert len(multitask_scores) == 1

  def test_multitask_model_evaluate_sklearn(self):

def test_multitask_model_evaluate_sklearn():
  """Test evaluation of a multitask metric."""
  n_tasks = 2
  X = np.random.rand(10, 5)
@@ -233,14 +154,15 @@ class TestEvaluator(unittest.TestCase):
  model = dc.models.MultitaskRegressor(2, 5)
  evaluator = Evaluator(model, dataset, [])
  multitask_scores, all_task_scores = evaluator.compute_model_performance(
        sklearn.metrics.mean_absolute_error, per_task_metrics=True)
      dc.metrics.mean_absolute_error, per_task_metrics=True)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores['metric-1'] > 0
  assert isinstance(all_task_scores, dict)
  assert len(multitask_scores) == 1

  def test_multitask_model_evaluate(self):

def test_multitask_model_evaluate():
  """Test evaluation of a multitask metric."""
  n_tasks = 2
  X = np.random.rand(10, 5)
@@ -248,9 +170,166 @@ class TestEvaluator(unittest.TestCase):
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(2, 5)
  multitask_scores, all_task_scores = model.evaluate(
        dataset, sklearn.metrics.mean_absolute_error, per_task_metrics=True)
      dataset, dc.metrics.mean_absolute_error, per_task_metrics=True)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores["metric-1"] > 0
  assert isinstance(all_task_scores, dict)


def test_evaluator_dc_multi_metric():
  """Test an evaluator on a dataset."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  evaluator = Evaluator(model, dataset, [])
  metric1 = dc.metrics.Metric(dc.metrics.mae_score)
  metric2 = dc.metrics.Metric(dc.metrics.r2_score)
  multitask_scores = evaluator.compute_model_performance([metric1, metric2])
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 2
  assert multitask_scores['mae_score'] > 0
  assert "r2_score" in multitask_scores


def test_model_evaluate_dc_multi_metric():
  """Test an evaluator on a dataset."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  metric1 = dc.metrics.Metric(dc.metrics.mae_score)
  metric2 = dc.metrics.Metric(dc.metrics.r2_score)
  multitask_scores = model.evaluate(dataset, [metric1, metric2])
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 2
  assert multitask_scores['mae_score'] > 0
  assert "r2_score" in multitask_scores


def test_generator_evaluator_dc_metric_multitask_single_point():
  """Test generator evaluator on a generator."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  generator = model.default_generator(dataset, pad_batches=False)
  evaluator = GeneratorEvaluator(model, generator, [])
  metric = dc.metrics.Metric(dc.metrics.mae_score)
  multitask_scores = evaluator.compute_model_performance(metric)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores['mae_score'] > 0
  assert len(multitask_scores) == 1


def test_evaluator_sklearn_metric():
  """Test an evaluator on a dataset."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  evaluator = Evaluator(model, dataset, [])
  multitask_scores = evaluator.compute_model_performance(
      dc.metrics.mean_absolute_error)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  # Note that since no name was provided, metrics are indexed by the
  # order given.
  assert multitask_scores['metric-1'] > 0


def test_generator_evaluator_dc_metric_multitask():
  """Test generator evaluator on a generator."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  generator = model.default_generator(dataset, pad_batches=False)
  evaluator = GeneratorEvaluator(model, generator, [])
  metric = dc.metrics.Metric(dc.metrics.mae_score)
  multitask_scores = evaluator.compute_model_performance(metric)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  assert multitask_scores['mae_score'] > 0


def test_model_evaluate_sklearn_metric():
  """Test a model evaluate on a dataset."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  multitask_scores = model.evaluate(dataset, dc.metrics.mean_absolute_error)
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores) == 1
  # Note that since no name was provided, metrics are indexed by the
  # order given.
  assert multitask_scores['metric-1'] > 0


def test_evaluator_sklearn_multi_metric():
  """Test an evaluator on a dataset."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  evaluator = Evaluator(model, dataset, [])
  multitask_scores = evaluator.compute_model_performance(
      [dc.metrics.mean_absolute_error, dc.metrics.r2_score])
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores.keys()) == 2
  # Note that since no name was provided, metrics are indexed by the
  # order given.
  assert multitask_scores['metric-1'] > 0
  assert "metric-2" in multitask_scores


def test_model_evaluate_sklearn_multi_metric():
  """Test an evaluator on a dataset."""
  X = np.random.rand(10, 5)
  y = np.random.rand(10, 1)
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.MultitaskRegressor(1, 5)
  multitask_scores = model.evaluate(
      dataset, [dc.metrics.mean_absolute_error, dc.metrics.r2_score])
  assert isinstance(multitask_scores, dict)
  assert len(multitask_scores.keys()) == 2
  # Note that since no name was provided, metrics are indexed by the
  # order given.
  assert multitask_scores['metric-1'] > 0
  assert "metric-2" in multitask_scores


def test_gc_binary_classification():
  """Test multiclass classification evaluation."""
  smiles = ["C", "CC"]
  featurizer = dc.feat.ConvMolFeaturizer()
  X = featurizer.featurize(smiles)
  y = np.random.randint(2, size=(len(smiles),))
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.GraphConvModel(1, mode="classification")
  # TODO: Fix this case with correct thresholding
  evaluator = Evaluator(model, dataset, [])
  multitask_scores = evaluator.compute_model_performance(
      dc.metrics.accuracy_score, n_classes=2, threshold=True)
  assert len(multitask_scores) == 1
  assert multitask_scores["metric-1"] >= 0


def test_gc_multiclass_classification():
  """Test multiclass classification evaluation."""
  np.random.seed(123)
  smiles = ["C", "CC"]
  featurizer = dc.feat.ConvMolFeaturizer()
  X = featurizer.featurize(smiles)
  y = np.random.randint(5, size=(len(smiles),))
  dataset = dc.data.NumpyDataset(X, y)
  model = dc.models.GraphConvModel(1, mode="classification")
  # TODO: Fix this case with correct thresholding
  evaluator = Evaluator(model, dataset, [])
  multitask_scores = evaluator.compute_model_performance(
      dc.metrics.accuracy_score, n_classes=5, threshold=True)
  assert len(multitask_scores) == 1
  assert multitask_scores["metric-1"] >= 0