Commit b3463f05 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Fixes

parent 63f2ecf3
Loading
Loading
Loading
Loading
+46 −26
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ import numpy as np
import warnings
import sklearn.metrics
import logging
# TODO: Imported metrics will be removed in a future version of DeepChem
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import recall_score
from sklearn.metrics import r2_score
@@ -108,7 +109,7 @@ def normalize_weight_shape(w, n_samples, n_tasks):
    
    

def normalize_prediction_shape(y, mode="classification", n_classes=None):
def normalize_prediction_shape(y, mode=None, n_classes=None):
  """A utility function to correct the shape of the input array.

  The metric computation classes expect that inputs for classification
@@ -133,9 +134,12 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
    must take values from `0` to `n_classes-1` as integers. If
    `mode=="regression"`, `y` is an array of shape `(N,)` or `(N,
    n_tasks)`or `(N, n_tasks, 1)`. In the edge case where `N == 1`,
    `y` may be a scalar.
  mode: str
    Must be either "classification" or "regression".
    `y` may be a scalar. If `mode` is None, then `y` can be of any
    shape and is returned unchanged.
  mode: str, optional (default None)
    If `mode` is "classification" or "regression", attempts to apply
    data transformations. For other modes, performs no transformations
    to data and returns as-is.
  n_classes: int, optional
    If specified use this as the number of classes. Else will try to
    impute it as `n_classes = max(y) + 1` for arrays and as
@@ -149,6 +153,7 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
    n_tasks, n_classes)`. If `mode=="regression"`, `y_out` is an array
    of shape `(N, n_tasks)`.
  """
  if mode == "classification":
    if n_classes is None:
      if isinstance(y, np.ndarray):
        # Find number of classes. Note that `y` must have values in
@@ -157,7 +162,6 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
      else:
        # scalar case
        n_classes = 2
  if mode == "classification":
    if isinstance(y, np.ndarray):
      if len(y.shape) == 1:
        # y_hot is of shape (N, n_classes)
@@ -201,6 +205,10 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
        raise ValueError("y must a float sclar or a ndarray of shape `(N,)` or `(N, n_tasks)` or `(N, n_tasks, 1)` for regression problems.")
      y = np.array(y)
      y_out = np.reshape(y, (1, 1))
  else:
    # If mode isn't classification or regression don't perform any
    # transformations.
    y_out = y
  return y_out
    
def to_one_hot(y, n_classes=2):
@@ -454,7 +462,7 @@ class Metric(object):
               name=None,
               threshold=None,
               mode=None,
               **kwargs):
               compute_energy_metric=None):
    """
    Parameters
    ----------
@@ -464,17 +472,21 @@ class Metric(object):
    task_averager: function, optional
      If not None, should be a function that averages metrics across
      tasks. For example, task_averager=np.mean. If task_averager is
      provided, this task will be inherited as a multitask metric.
    name: str, optional
      provided, this metric will be assumed to be multitask and
      `self.is_multitask` will be set to True. 
    name: str, optional (default None)
      Name of this metric
    threshold: float, optional
    threshold: float, optional (default None)
      Used for binary metrics and is the threshold for the positive
      class
    mode: str, optional
      Must be either classification or regression.
      class.
    mode: str, optional (default None)
      Should usually be "classification" or "regression."
    compute_energy_metric: bool, optional (default None)
      Deprecated metric. Will be removed in a future version of
      DeepChem. Do not use.
    """
    if "compute_energy_metric" in kwargs:
      self.compute_energy_metric = kwargs["compute_energy_metric"]
    if compute_energy_metric is not None:
      self.compute_energy_metric = compute_energy_metric 
      logger.warn("compute_energy_metric is deprecated and will be removed in a future version of DeepChem.")
    else:
      self.compute_energy_metric = False
@@ -483,13 +495,20 @@ class Metric(object):
    self.is_multitask = (self.task_averager is not None)
    if name is None:
      if not self.is_multitask:
        if hasattr(self.metric, '__name__'):
          self.name = self.metric.__name__
        else:
          self.name = "unknown metric"
      else:
        if hasattr(self.metric, '__name__'):
          self.name = self.task_averager.__name__ + "-" + self.metric.__name__
        else:
          self.name = "unknown metric"
    else:
      self.name = name
    self.threshold = threshold
    if mode is None:
      # These are some smart defaults 
      if self.metric.__name__ in [
          "roc_auc_score", "matthews_corrcoef", "recall_score",
          "accuracy_score", "kappa_score", "precision_score",
@@ -502,11 +521,12 @@ class Metric(object):
      ]:
        mode = "regression"
      else:
        raise ValueError("Must specify mode for new metric.")
    assert mode in ["classification", "regression"]
        logger.info("Support for non classification/regression metrics is new. Check your results carefully.")
    # Attempts to set threshold defaults intelligently
    if self.metric.__name__ in [
        "accuracy_score", "balanced_accuracy_score", "recall_score",
        "matthews_corrcoef", "precision_score", "f1_score"
        "matthews_corrcoef", "roc_auc_score", "precision_score",
        "f1_score"
    ] and threshold is None:
      self.threshold = 0.5
    self.mode = mode
+137 −40
Original line number Diff line number Diff line
@@ -14,6 +14,54 @@ from deepchem.metrics import Metric
logger = logging.getLogger(__name__)


def output_statistics(scores, stats_out):
  """Dump computed metric scores to a text file.

  The string form of `scores` followed by a newline is written to
  `stats_out`, which is opened in write mode. A deprecation warning is
  logged on every call.

  Parameters
  ----------
  scores: dict
    Dictionary mapping names of metrics to scores.
  stats_out: str
    Name of file to write scores to.
  """
  logger.warning("output_statistics is deprecated.")
  with open(stats_out, mode="w") as handle:
    handle.write("".join((str(scores), "\n")))


def output_predictions(dataset, y_preds, csv_out):
  """Writes predictions to file.

  Writes predictions made on `dataset` to a specified file on disk.
  `dataset.ids` are used to label the rows. The produced CSV file has
  the following layout:

  | ID          | Task1Name    | Task2Name    |
  | ----------- | ------------ | ------------ |
  | identifier1 | prediction11 | prediction12 |
  | identifier2 | prediction21 | prediction22 |

  Parameters
  ----------
  dataset: dc.data.Dataset
    Dataset on which predictions have been made. Only `dataset.ids`
    and `dataset.get_task_names()` are read.
  y_preds: np.ndarray
    Predictions to output. They are reshaped to
    `(len(y_preds), n_tasks)` before being written.
  csv_out: str
    Name of file to write predictions to.

  Raises
  ------
  ValueError
    Raised by `np.reshape` if `y_preds` cannot be reshaped to
    `(len(y_preds), n_tasks)`.
  AssertionError
    If the number of prediction rows differs from the number of ids.
  """
  mol_ids = dataset.ids
  # Query the task names once. Coercing to `list` keeps the header
  # concatenation below a plain list concatenation whatever sequence
  # type the dataset hands back.
  task_names = list(dataset.get_task_names())
  n_tasks = len(task_names)
  y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
  assert len(y_preds) == len(mol_ids)
  # The csv module emits its own line terminators, so the file has to
  # be opened with newline="" to avoid doubled line endings on
  # platforms that translate "\n" on write.
  with open(csv_out, "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["ID"] + task_names)
    for mol_id, y_pred in zip(mol_ids, y_preds):
      csvwriter.writerow([mol_id] + list(y_pred))


def _process_metric_input(metrics):
  """A private helper method which processes metrics correctly.

@@ -41,7 +89,7 @@ def _process_metric_input(metrics):
    `dc.metrics.Metric` objects.
  """
  # Make sure input is a list
  if not len(metrics):
  if not isinstance(metrics, list):
    metrics = [metrics]
  final_metrics = []
  for i, metric in enumerate(metrics):
@@ -51,7 +99,7 @@ def _process_metric_input(metrics):
    # This case checks if input is a function then wraps a
    # dc.metrics.Metric object around it
    elif callable(metric):
      wrap_metric = Metric(metric, name="metric-%d" % i)
      wrap_metric = Metric(metric, name="metric-%d" % (i + 1))
      final_metrics.append(wrap_metric)
    else:
      raise ValueError(
@@ -114,15 +162,29 @@ class Evaluator(object):

  Example
  -------
  Evaluators allow for a model to be evaluated directly on a Metric
  for `sklearn`. Let's do a bit of setup constructing our dataset and
  model.

  >>> import numpy as np
  >>> X = np.random.rand(10, 5)
  >>> y = np.random.rand(10, 1)
  >>> dataset = dc.data.NumpyDataset(X, y)
  >>> model = dc.models.MultitaskRegressor(1, 5)
  >>> transformers = []

  Then you can evaluate this model as follows 
  >>> import sklearn
  >>> evaluator = Evaluator(model, dataset, transformers)
  >>> multitask_scores = evaluator.compute_model_performance(
  ...     sklearn.metrics.mean_absolute_error)

  Evaluators can also be used with `dc.metrics.Metric` objects as well
  in case you want to customize your metric further. 

  >>> evaluator = Evaluator(model, dataset, transformers)
  >>> metric = dc.metrics.Metric(dc.metrics.mae_score)
  >>> multitask_scores = evaluator.compute_model_performance([metric])
  >>> multitask_scores = evaluator.compute_model_performance(metric)
  """

  def __init__(self, model, dataset, transformers):
@@ -131,7 +193,6 @@ class Evaluator(object):
    self.output_transformers = [
        transformer for transformer in transformers if transformer.transform_y
    ]
    self.task_names = dataset.get_task_names()

  def output_statistics(self, scores, stats_out):
    """ Write computed stats to file.
@@ -143,26 +204,35 @@ class Evaluator(object):
    stats_out: str
      Name of file to write scores to.
    """
    logger.warning(
        "Evaluator.output_statistics is deprecated. Please use dc.utils.evaluate.output_statistics instead. This method will be removed in a future version of DeepChem."
    )
    with open(stats_out, "w") as statsfile:
      statsfile.write(str(scores) + "\n")

  def output_predictions(self, y_preds, csv_out):
    """Writes predictions to file.

    Writes predictions made on `self.dataset` to a specified file on
    disk. `self.dataset.ids` are used to format predictions.

    Parameters
    ----------
    y_preds: np.ndarray
      Predictions to output
    csvfile: str
    csv_out: str
      Name of file to write predictions to.
    """
    logger.warning(
        "Evaluator.output_predictions is deprecated. Please use dc.utils.evaluate.output_predictions instead. This method will be removed in a future version of DeepChem."
    )
    mol_ids = self.dataset.ids
    n_tasks = len(self.task_names)
    n_tasks = len(self.dataset.get_task_names())
    y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
    assert len(y_preds) == len(mol_ids)
    with open(csv_out, "w") as csvfile:
      csvwriter = csv.writer(csvfile)
      csvwriter.writerow(["Compound"] + self.dataset.get_task_names())
      csvwriter.writerow(["ID"] + self.dataset.get_task_names())
      for mol_id, y_pred in zip(mol_ids, y_preds):
        csvwriter.writerow([mol_id] + list(y_pred))

@@ -170,17 +240,29 @@ class Evaluator(object):
                                metrics,
                                csv_out=None,
                                stats_out=None,
                                per_task_metrics=False):
                                per_task_metrics=False,
                                n_classes=None):
    """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    csv_out: str, optional
    metrics: dc.metrics.Metric/list[dc.metrics.Metric]/function
      The set of metrics provided. This class attempts to do some
      intelligent handling of input. If a single `dc.metrics.Metric`
      object is provided or a list is provided, it will evaluate
      `self.model` on these metrics. If a function is provided, it is
      assumed to be a metric function that this method will attempt to
      wrap in a `dc.metrics.Metric` object. A metric function must
      accept two arguments, `y_true, y_pred` both of which are
      `np.ndarray` objects and return a floating point score.
    n_classes: int, optional (default None)
      If specified, will assume that all `metrics` are classification
      metrics and will use `n_classes` as the number of unique classes
      in `self.dataset`.
    csv_out: str, optional (Deprecated)
      Filename to write CSV of model predictions.
    stats_out: str, optional
    stats_out: str, optional (Deprecated)
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
@@ -193,39 +275,35 @@ class Evaluator(object):
      If `per_task_metrics == True`, then returns a second dictionary
      of scores for each task separately.
    """
    if csv_out is not None:
      logger.warning(
          "csv_out is deprecated as an argument and will be removed in a future version of DeepChem. Output is not written to CSV; manually write output instead."
      )
    if stats_out is not None:
      logger.warning(
          "stats_out is deprecated as an argument and will be removed in a future version of DeepChem. Stats output is not written; please manually write output instead"
      )
    # Process input metrics
    metrics = _process_metric_input(metrics)

    y = self.dataset.y
    y = undo_transforms(y, self.output_transformers)
    w = self.dataset.w

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    y_pred = self.model.predict(self.dataset, self.output_transformers)
    if mode == "classification":
      y_pred_print = np.argmax(y_pred, -1)
    else:
      y_pred_print = y_pred

    multitask_scores = {}
    all_task_scores = {}

    if csv_out is not None:
      logger.info("Saving predictions to %s" % csv_out)
      self.output_predictions(y_pred_print, csv_out)

    # Compute multitask metrics
    for metric in metrics:
      results = metric.compute_metric(
          y, y_pred, w, per_task_metrics=per_task_metrics, n_classes=n_classes)
      if per_task_metrics:
        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
            y, y_pred, w, per_task_metrics=True)
        multitask_scores[metric.name], computed_metrics = results
        all_task_scores[metric.name] = computed_metrics
      else:
        multitask_scores[metric.name] = metric.compute_metric(
            y, y_pred, w, per_task_metrics=False)

    if stats_out is not None:
      logger.info("Saving stats to %s" % stats_out)
      self.output_statistics(multitask_scores, stats_out)
        multitask_scores[metric.name] = results

    if not per_task_metrics:
      return multitask_scores
@@ -247,10 +325,21 @@ class GeneratorEvaluator(object):
  >>> y = np.random.rand(10, 1)
  >>> dataset = dc.data.NumpyDataset(X, y)
  >>> model = dc.models.MultitaskRegressor(1, 5)
  >>> transformers = []
  >>> generator = model.default_generator(dataset, pad_batches=False)
  >>> evaluator = Evaluator(model, generator, transformers)
  >>> multitask_scores = evaluator.compute_model_performance([metric])

  Then you can evaluate this model as follows 

  >>> import sklearn
  >>> evaluator = GeneratorEvaluator(model, generator, transformers)
  >>> multitask_scores = evaluator.compute_model_performance(
  ...     sklearn.metrics.mean_absolute_error)

  Evaluators can also be used with `dc.metrics.Metric` objects as well
  in case you want to customize your metric further. 

  >>> evaluator = GeneratorEvaluator(model, dataset, transformers)
  >>> metric = dc.metrics.Metric(dc.metrics.mae_score)
  >>> multitask_scores = evaluator.compute_model_performance(metric)
  """

  def __init__(self, model, generator, transformers, labels=None, weights=None):
@@ -281,7 +370,10 @@ class GeneratorEvaluator(object):
    if labels is not None and len(labels) != 1:
      raise ValueError("GeneratorEvaluator currently only supports one label")

  def compute_model_performance(self, metrics, per_task_metrics=False):
  def compute_model_performance(self,
                                metrics,
                                per_task_metrics=False,
                                n_classes=None):
    """
    Computes statistics of model on test data and saves results to csv.

@@ -299,6 +391,10 @@ class GeneratorEvaluator(object):
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask
      dataset.
    n_classes: int, optional (default None)
      If specified, will assume that all `metrics` are classification
      metrics and will use `n_classes` as the number of unique classes
      in `self.dataset`.

    Returns
    -------
@@ -315,6 +411,7 @@ class GeneratorEvaluator(object):
    w = []

    def generator_closure():
      """This function is used to pull true labels/weights out as we iterate over the generator."""
      if self.label_keys is None:
        weights = None
        # This is a KerasModel.
@@ -350,13 +447,13 @@ class GeneratorEvaluator(object):

    # Compute multitask metrics
    for metric in metrics:
      results = metric.compute_metric(
          y, y_pred, w, per_task_metrics=per_task_metrics)
      if per_task_metrics:
        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
            y, y_pred, w, per_task_metrics=True)
        multitask_scores[metric.name], computed_metrics = results
        all_task_scores[metric.name] = computed_metrics
      else:
        multitask_scores[metric.name] = metric.compute_metric(
            y, y_pred, w, per_task_metrics=False)
        multitask_scores[metric.name] = results

    if not per_task_metrics:
      return multitask_scores
+66 −18
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
import deepchem as dc
import numpy as np
import unittest
import sklearn
from deepchem.utils.evaluate import Evaluator
from deepchem.utils.evaluate import GeneratorEvaluator

@@ -12,40 +13,72 @@ class TestEvaluator(unittest.TestCase):
    X = np.random.rand(10, 5)
    y = np.random.rand(10, 1)
    self.dataset = dc.data.NumpyDataset(X, y)
    self.model = dc.models.MultitaskRegressor(1, 5)

  def test_evaluator_dc_metric(self):
    """Test an evaluator on a dataset."""
    model = dc.models.MultitaskRegressor(1, 5)
    transformers = []
    evaluator = Evaluator(model, self.dataset, transformers)
    evaluator = Evaluator(self.model, self.dataset, [])
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = evaluator.compute_model_performance([metric])
    multitask_scores = evaluator.compute_model_performance(metric)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0

  def test_evaluator_dc_multi_metric(self):
    """Evaluate with a list of two `dc.metrics.Metric` objects.

    Both metrics are expected to appear in the returned dictionary
    under the keys `mae_score` and `r2_score`.
    """
    evaluator = Evaluator(self.model, self.dataset, [])
    mae_metric = dc.metrics.Metric(dc.metrics.mae_score)
    r2_metric = dc.metrics.Metric(dc.metrics.r2_score)
    scores = evaluator.compute_model_performance([mae_metric, r2_metric])
    assert isinstance(scores, dict)
    assert len(scores) == 2
    assert scores['mae_score'] > 0
    assert "r2_score" in scores
    
    
  def test_evaluator_sklearn_metric(self):
    """Evaluate with a single bare sklearn metric function."""
    evaluator = Evaluator(self.model, self.dataset, [])
    metric_fn = sklearn.metrics.mean_absolute_error
    scores = evaluator.compute_model_performance(metric_fn)
    assert isinstance(scores, dict)
    assert len(scores) == 1
    # No name was provided for the metric, so it is indexed by the
    # order in which it was given.
    assert scores['metric-1'] > 0

  def test_evaluator_sklearn_multi_metric(self):
    """Evaluate with a list of two bare sklearn metric functions."""
    evaluator = Evaluator(self.model, self.dataset, [])
    metric_fns = [
        sklearn.metrics.mean_absolute_error,
        sklearn.metrics.r2_score,
    ]
    scores = evaluator.compute_model_performance(metric_fns)
    assert isinstance(scores, dict)
    assert len(scores) == 2
    # No names were provided for the metrics, so they are indexed by
    # the order in which they were given.
    assert scores['metric-1'] > 0
    assert "metric-2" in scores

  def test_generator_evaluator_dc_metric_multitask(self):
    """Test generator evaluator on a generator."""
    model = dc.models.MultitaskRegressor(1, 5)
    generator = model.default_generator(
    generator = self.model.default_generator(
      self.dataset, pad_batches=False)
    transformers = []
    evaluator = GeneratorEvaluator(model, generator, transformers)
    evaluator = GeneratorEvaluator(self.model, generator, [])
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = evaluator.compute_model_performance([metric])
    multitask_scores = evaluator.compute_model_performance(metric)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0

  def test_generator_evaluator_dc_metric_multitask_single_point(self):
    """Test generator evaluator on a generator."""
    model = dc.models.MultitaskRegressor(1, 5)
    generator = model.default_generator(
    generator = self.model.default_generator(
      self.dataset, pad_batches=False)
    transformers = []
    evaluator = GeneratorEvaluator(model, generator, transformers)
    evaluator = GeneratorEvaluator(self.model, generator, [])
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = evaluator.compute_model_performance([metric])
    multitask_scores = evaluator.compute_model_performance(metric)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    print("multitask_scores")
@@ -54,11 +87,26 @@ class TestEvaluator(unittest.TestCase):

  def test_evaluator_dc_metric_singletask(self):
    """Test an evaluator on a dataset."""
    model = dc.models.MultitaskRegressor(1, 5)
    transformers = []
    evaluator = Evaluator(model, self.dataset, transformers)
    evaluator = Evaluator(self.model, self.dataset, [])
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = evaluator.compute_model_performance([metric])
    multitask_scores = evaluator.compute_model_performance(metric)
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0

  def test_multiclass_classification_singletask(self):
    """Evaluate a single-task classifier on 5-class labels.

    A bare sklearn accuracy function is passed as the metric together
    with an explicit `n_classes`.
    """
    n_samples, n_features, n_classes = 100, 5, 5
    features = np.random.rand(n_samples, n_features)
    labels = np.random.randint(n_classes, size=(n_samples,))
    dataset = dc.data.NumpyDataset(features, labels)
    model = dc.models.MultitaskClassifier(1, n_features, n_classes=n_classes)
    evaluator = Evaluator(model, dataset, [])
    scores = evaluator.compute_model_performance(
        sklearn.metrics.accuracy_score, n_classes=n_classes)
    assert len(scores) == 1
    assert scores["metric-1"] >= 0

# TODO: Add a multitask metrics example
# TODO: Add a multitask per-task metric example
# TODO: Add metrics for images here as a test