Commit 63f2ecf3 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Examples

parent 03351e0b
Loading
Loading
Loading
Loading
+0 −4
Original line number Diff line number Diff line
@@ -198,10 +198,6 @@ def normalize_prediction_shape(y, mode="classification", n_classes=None):
      try:
        y = float(y)
      except TypeError:
        #################
        print("y")
        print(y)
        #################
        raise ValueError("y must a float sclar or a ndarray of shape `(N,)` or `(N, n_tasks)` or `(N, n_tasks, 1)` for regression problems.")
      y = np.array(y)
      y_out = np.reshape(y, (1, 1))
+30 −10
Original line number Diff line number Diff line
@@ -32,6 +32,11 @@ def undo_transforms(y, transformers):
  transformers: list[dc.trans.Transformer]
    List of transformations which have already been applied to `y` in the
    order specified.

  Returns
  -------
  y_out: np.ndarray
    The array with all transformations reversed.
  """
  # Note that transformers have to be undone in reversed order
  for transformer in reversed(transformers):
@@ -50,9 +55,9 @@ def undo_grad_transforms(grad, tasks, transformers):
def get_grad_statistics(dataset):
  """Computes and returns statistics of a dataset

  This function assumes that the first task of a dataset holds the energy for
  an input system, and that the remaining tasks holds the gradient for the
  system.
  This function assumes that the first task of a dataset holds the
  energy for an input system, and that the remaining tasks holds the
  gradient for the system.
  """
  if len(dataset) == 0:
    return None, None, None, None
@@ -68,13 +73,28 @@ def get_grad_statistics(dataset):
class Transformer(object):
  """Abstract base class for different data transformation techniques.

  `Transformer` objects are used to transform `Dataset` objects in ways that
  are useful to machine learning. Transformations might process the data to
  make learning easier (say by normalizing), or may implement techniques such
  as data augmentation.

  Note that you can never instantiate a `Transformer` class directly. You will
  want to use one of the concrete subclasses.
  A transformer is an object that applies a transformation to a given
  dataset. Think of a transformation as a mathematical operation which
  makes the source dataset more amenable to learning. For example, one
  transformer could normalize the features for a dataset (ensuring
  they have zero mean and unit standard deviation). Another
  transformer could for example threshold values in a dataset so that
  values outside a given range are truncated. Yet another transformer
  could act as a data augmentation routine, generating multiple
  different images from each source datapoint (a transformation need
  not necessarily be one to one).

  Transformers are designed to be chained, since data pipelines often
  chain multiple different transformations to a dataset. Transformers
  are also designed to be scalable and can be applied to 
  large `dc.data.Dataset` objects. Note that Transformers are not
  usually thread-safe so you will have to be careful in processing
  very large datasets.

  This class is an abstract superclass that isn't meant to be directly
  instantiated. Instead, you will want to instantiate one of the
  subclasses of this class in order to perform concrete
  transformations.
  """
  # Hack to allow for easy unpickling:
  # http://stefaanlippens.net/pickleproblem
+71 −34
Original line number Diff line number Diff line
@@ -9,10 +9,57 @@ import pandas as pd
import sklearn
from deepchem.trans import undo_transforms
from deepchem.metrics import from_one_hot
from deepchem.metrics import Metric

logger = logging.getLogger(__name__)


def _process_metric_input(metrics):
  """A private helper method which processes metrics correctly.

  Metrics can be input as `dc.metrics.Metric` objects, lists of
  `dc.metrics.Metric` objects, or as raw metric functions or lists of
  raw metric functions. Metric functions are functions which accept
  two arguments `y_true, y_pred` both of which must be `np.ndarray`
  objects and return a float value. This functions normalizes these
  different types of inputs to type `list[dc.metrics.Metric]` object
  for ease of later processing.

  Note that raw metric functions which don't have names attached will
  simply be named "metric-#" where # is their position in the provided
  metric list. For example, "metric-1" or "metric-7"

  Parameters
  ----------
  metrics: dc.metrics.Metric/list[dc.metrics.Metric]/metric function/ list[metric function]
    Input metrics to process.

  Returns
  -------
  final_metrics: list[dc.metrics.Metric]
    Converts all input metrics and outputs a list of
    `dc.metrics.Metric` objects.
  """
  # Make sure input is a list
  if not len(metrics):
    metrics = [metrics]
  final_metrics = []
  for i, metric in enumerate(metrics):
    # Ensure that metric is wrapped in a list.
    if isinstance(metric, Metric):
      final_metrics.append(metric)
    # This case checks if input is a function then wraps a
    # dc.metrics.Metric object around it
    elif callable(metric):
      wrap_metric = Metric(metric, name="metric-%d" % i)
      final_metrics.append(wrap_metric)
    else:
      raise ValueError(
          "metrics must be one of metric function / dc.metrics.Metric object / list of dc.metrics.Metric or metric functions."
      )
  return final_metrics


def relative_difference(x, y):
  """Compute the relative difference between x and y

@@ -100,8 +147,7 @@ class Evaluator(object):
      statsfile.write(str(scores) + "\n")

  def output_predictions(self, y_preds, csv_out):
    """
    Writes predictions to file.
    """Writes predictions to file.

    Parameters
    ----------
@@ -156,13 +202,6 @@ class Evaluator(object):
    else:
      mode = metrics[0].mode
    y_pred = self.model.predict(self.dataset, self.output_transformers)
    #########################################
    #print("y.shape")
    #print(y.shape)
    #print("y_pred.shape")
    #print(y_pred.shape)
    #assert 0 == 1
    #########################################
    if mode == "classification":
      y_pred_print = np.argmax(y_pred, -1)
    else:
@@ -248,10 +287,18 @@ class GeneratorEvaluator(object):

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    metrics: dc.metrics.Metric/list[dc.metrics.Metric]/function
      The set of metrics provided. This class attempts to do some
      intelligent handling of input. If a single `dc.metrics.Metric`
      object is provided or a list is provided, it will evaluate
      `self.model` on these metrics. If a function is provided, it is
      assumed to be a metric function that this method will attempt to
      wrap in a `dc.metrics.Metric` object. A metric function must
      accept two arguments, `y_true, y_pred` both of which are
      `np.ndarray` objects and return a floating point score.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
      If true, return computed metric for each task on multitask
      dataset.

    Returns
    -------
@@ -261,6 +308,9 @@ class GeneratorEvaluator(object):
      If `per_task_metrics == True`, then returns a second dictionary
      of scores for each task separately.
    """
    metrics = _process_metric_input(metrics)

    # We use y/w to aggregate labels/weights across generator.
    y = []
    w = []

@@ -284,42 +334,29 @@ class GeneratorEvaluator(object):
            w.append(weights[0])
          yield (inputs, labels, weights)

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    # Process predictions and populate y/w lists
    y_pred = self.model.predict_on_generator(generator_closure())
    #y = np.concatenate(y, axis=0)

    # Combine labels/weights
    y = np.concatenate(y, axis=0)
    w = np.concatenate(w, axis=0)

    multitask_scores = {}
    all_task_scores = {}

    # Undo data transformations.
    y = undo_transforms(y, self.output_transformers)
    y_pred = undo_transforms(y_pred, self.output_transformers)
    #if len(w) != 0:
    #  w = np.array(w)
    #  if np.prod(w.shape) == y.shape[0]:
    #    w = np.reshape(w, newshape=(y.shape[0], 1))
    #  else:
    #    w = np.reshape(w, newshape=y.shape)

    # Compute multitask metrics
    #n_classes = y.shape[-1]
    for metric in metrics:
      if per_task_metrics:
        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
            #y, y_pred, w, per_task_metrics=True, n_classes=n_classes)
            y,
            y_pred,
            w,
            per_task_metrics=True)
            y, y_pred, w, per_task_metrics=True)
        all_task_scores[metric.name] = computed_metrics
      else:
        multitask_scores[metric.name] = metric.compute_metric(
            #y, y_pred, w, per_task_metrics=False, n_classes=n_classes)
            y,
            y_pred,
            w,
            per_task_metrics=False)
            y, y_pred, w, per_task_metrics=False)

    if not per_task_metrics:
      return multitask_scores
+35 −39
Original line number Diff line number Diff line
@@ -7,14 +7,17 @@ from deepchem.utils.evaluate import GeneratorEvaluator

class TestEvaluator(unittest.TestCase):

  def test_evaluator_dc_metric(self):
    """Test an evaluator on a dataset."""
  def setUp(self):
    """Perform common setup for tests."""
    X = np.random.rand(10, 5)
    y = np.random.rand(10, 1)
    dataset = dc.data.NumpyDataset(X, y)
    self.dataset = dc.data.NumpyDataset(X, y)

  def test_evaluator_dc_metric(self):
    """Test an evaluator on a dataset."""
    model = dc.models.MultitaskRegressor(1, 5)
    transformers = []
    evaluator = Evaluator(model, dataset, transformers)
    evaluator = Evaluator(model, self.dataset, transformers)
    metric = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = evaluator.compute_model_performance([metric])
    assert isinstance(multitask_scores, dict)
@@ -22,12 +25,10 @@ class TestEvaluator(unittest.TestCase):
    assert multitask_scores['mae_score'] > 0

  def test_generator_evaluator_dc_metric_multitask(self):
    """Test generator evaluator on a dataset."""
    X = np.random.rand(10, 5)
    y = np.random.rand(10, 3)
    dataset = dc.data.NumpyDataset(X, y)
    """Test generator evaluator on a generator."""
    model = dc.models.MultitaskRegressor(1, 5)
    generator = model.default_generator(dataset, pad_batches=False)
    generator = model.default_generator(
      self.dataset, pad_batches=False)
    transformers = []
    evaluator = GeneratorEvaluator(model, generator, transformers)
    metric = dc.metrics.Metric(dc.metrics.mae_score)
@@ -36,33 +37,28 @@ class TestEvaluator(unittest.TestCase):
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0

#  def test_generator_evaluator_dc_metric_multitask_single_point(self):
#    """Test generator evaluator on a dataset."""
#    X = np.random.rand(1, 5)
#    y = np.random.rand(1, 3)
#    dataset = dc.data.NumpyDataset(X, y)
#    model = dc.models.MultitaskRegressor(1, 5)
#    generator = model.default_generator(dataset, pad_batches=False)
#    transformers = []
#    evaluator = GeneratorEvaluator(model, generator, transformers)
#    metric = dc.metrics.Metric(dc.metrics.mae_score)
#    multitask_scores = evaluator.compute_model_performance([metric])
#    assert isinstance(multitask_scores, dict)
#    assert len(multitask_scores) == 1
#    print("multitask_scores")
#    print(multitask_scores)
#    assert multitask_scores['mae_score'] > 0
#
#  def test_evaluator_dc_metric_singletask(self):
#    """Test an evaluator on a dataset."""
#    X = np.random.rand(10, 5)
#    y = np.random.rand(10)
#    dataset = dc.data.NumpyDataset(X, y)
#    model = dc.models.MultitaskRegressor(1, 5)
#    transformers = []
#    evaluator = Evaluator(model, dataset, transformers)
#    metric = dc.metrics.Metric(dc.metrics.mae_score)
#    multitask_scores = evaluator.compute_model_performance([metric])
#    assert isinstance(multitask_scores, dict)
#    assert len(multitask_scores) == 1
#    assert multitask_scores['mae_score'] > 0
  def test_generator_evaluator_dc_metric_multitask_single_point(self):
    """Score a model's default generator with a single MAE metric.

    Builds a `MultitaskRegressor`, wraps its default generator over
    `self.dataset` in a `GeneratorEvaluator` with no output
    transformers, and checks that exactly one positive `mae_score`
    entry comes back.
    """
    regressor = dc.models.MultitaskRegressor(1, 5)
    batches = regressor.default_generator(self.dataset, pad_batches=False)
    # Empty list: no output transformers are applied in this test.
    no_transformers = []
    gen_evaluator = GeneratorEvaluator(regressor, batches, no_transformers)
    mae = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = gen_evaluator.compute_model_performance([mae])

    # The result is a dict keyed by metric name with one entry.
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    print("multitask_scores")
    print(multitask_scores)
    assert multitask_scores['mae_score'] > 0

  def test_evaluator_dc_metric_singletask(self):
    """Score a model on `self.dataset` with a single MAE metric.

    Builds a `MultitaskRegressor`, evaluates it through `Evaluator`
    with no transformers, and checks that exactly one positive
    `mae_score` entry comes back.
    """
    regressor = dc.models.MultitaskRegressor(1, 5)
    # Empty list: no transformers are applied in this test.
    no_transformers = []
    dataset_evaluator = Evaluator(regressor, self.dataset, no_transformers)
    mae = dc.metrics.Metric(dc.metrics.mae_score)
    multitask_scores = dataset_evaluator.compute_model_performance([mae])

    # The result is a dict keyed by metric name with one entry.
    assert isinstance(multitask_scores, dict)
    assert len(multitask_scores) == 1
    assert multitask_scores['mae_score'] > 0