Commit afad4301 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

change

parent af2db874
Loading
Loading
Loading
Loading
+37 −3
Original line number Diff line number Diff line
@@ -1655,6 +1655,36 @@ class DiskDataset(Dataset):
    return np.array(
        load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)

  def get_shard_y(self, i):
    """Retrieves the labels for the i-th shard from disk.

    Serves labels from the in-memory shard cache when available; otherwise
    reads them back from the on-disk file recorded in the metadata table.

    Parameters
    ----------
    i: int
      Shard index for shard to retrieve labels from
    """
    cached = self._cached_shards
    if cached is not None and cached[i] is not None:
      # Cache hit: no disk access required.
      return cached[i].y
    label_file = self.metadata_df.iloc[i]['y']
    labels = load_from_disk(os.path.join(self.data_dir, label_file))
    return np.array(labels, dtype=object)

  def get_shard_w(self, i):
    """Retrieves the weights for the i-th shard from disk.

    Serves weights from the in-memory shard cache when available; otherwise
    reads them back from the on-disk file recorded in the metadata table.

    Parameters
    ----------
    i: int
      Shard index for shard to retrieve weights from
    """
    cached = self._cached_shards
    if cached is not None and cached[i] is not None:
      # Cache hit: no disk access required.
      return cached[i].w
    weight_file = self.metadata_df.iloc[i]['w']
    weights = load_from_disk(os.path.join(self.data_dir, weight_file))
    return np.array(weights, dtype=object)

  def add_shard(self, X, y, w, ids):
    """Adds a data shard."""
    metadata_rows = self.metadata_df.values.tolist()
@@ -1758,9 +1788,12 @@ class DiskDataset(Dataset):
  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    if len(self) == 0:
      return np.array([])
    ys = []
    one_dimensional = False
    for (_, y_b, _, _) in self.itershards():
    for i in range(self.get_number_shards()):
      y_b = self.get_shard_y(i)
      ys.append(y_b)
      if len(y_b.shape) == 1:
        one_dimensional = True
@@ -1774,8 +1807,9 @@ class DiskDataset(Dataset):
    """Get the weight vector for this dataset as a single numpy array."""
    ws = []
    one_dimensional = False
    for (_, _, w_b, _) in self.itershards():
      ws.append(np.array(w_b))
    for i in range(self.get_number_shards()):
      w_b = self.get_shard_w(i)
      ws.append(w_b)
      if len(w_b.shape) == 1:
        one_dimensional = True
    if not one_dimensional:
+30 −0
Original line number Diff line number Diff line
import numpy as np
import deepchem as dc


def test_y_property():
  """Test that dataset.y works."""
  n_rows, n_cols, n_tasks = 10, 10, 1
  X = np.random.rand(n_rows, n_cols)
  y = np.random.randint(2, size=(n_rows, n_tasks))
  w = np.ones((n_rows, n_tasks))
  ids = np.array(["id"] * n_rows)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Labels round-tripped through disk must equal the originals exactly.
  np.testing.assert_array_equal(y, dataset.y)


def test_w_property():
  """Test that dataset.w works."""
  # (Docstring previously said "dataset.y" — copy-paste error; this test
  # exercises the w property.)
  num_datapoints = 10
  num_features = 10
  num_tasks = 1
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.ones((num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  w_out = dataset.w
  # Weights round-tripped through disk must equal the originals exactly.
  np.testing.assert_array_equal(w, w_out)
+148 −0
Original line number Diff line number Diff line
import numpy as np
import unittest
import deepchem as dc
import itertools
import os


class TestBalancingTransformer(unittest.TestCase):
  """
  Test top-level API for transformer objects.

  Each test builds a small random classification dataset, applies
  BalancingTransformer (a w-only transformer), and checks that X, y and ids
  are untouched while the per-class total weights are equalized.
  """

  def test_binary_1d(self):
    """Test balancing transformer on single-task dataset without explicit task dimension."""
    n_samples = 20
    n_features = 3
    n_classes = 2
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples,))
    w = np.ones((n_samples,))
    dataset = dc.data.NumpyDataset(X, y, w)

    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = balancing_transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    y_task = y_t
    w_task = w_t
    w_orig_task = w
    # Assert that entries with zero weight retain zero weight
    np.testing.assert_allclose(w_task[w_orig_task == 0],
                               np.zeros_like(w_task[w_orig_task == 0]))
    # Check that sum of 0s equals sum of 1s in transformed for each task
    assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))

  def test_binary_singletask(self):
    """Test balancing transformer on single-task dataset."""
    n_samples = 20
    n_features = 3
    n_tasks = 1
    n_classes = 2
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w)

    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = balancing_transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(dataset.get_task_names()):
      y_task = y_t[:, ind]
      w_task = w_t[:, ind]
      w_orig_task = w[:, ind]
      # Assert that entries with zero weight retain zero weight
      np.testing.assert_allclose(w_task[w_orig_task == 0],
                                 np.zeros_like(w_task[w_orig_task == 0]))
      # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))

  def test_binary_multitask(self):
    """Test balancing transformer on multitask dataset."""
    n_samples = 10
    n_features = 3
    n_tasks = 5
    n_classes = 2
    # Seed the RNG for determinism; an unseeded draw can (rarely) produce a
    # task column with only one class, making this test flaky.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    multitask_dataset = dc.data.NumpyDataset(X, y, w)
    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=multitask_dataset)
    multitask_dataset = balancing_transformer.transform(multitask_dataset)
    X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
                            multitask_dataset.w, multitask_dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(multitask_dataset.get_task_names()):
      y_task = y_t[:, ind]
      w_task = w_t[:, ind]
      w_orig_task = w[:, ind]
      # Assert that entries with zero weight retain zero weight
      np.testing.assert_allclose(w_task[w_orig_task == 0],
                                 np.zeros_like(w_task[w_orig_task == 0]))
      # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))

  def test_multiclass_singletask(self):
    """Test balancing transformer on a multiclass single-task dataset."""
    n_samples = 50
    n_features = 3
    n_tasks = 1
    n_classes = 5
    # Seed for determinism: every class must appear in the random labels.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w)

    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = balancing_transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(dataset.get_task_names()):
      y_task = y_t[:, ind]
      w_task = w_t[:, ind]
      # Check that total weight is equal across every pair of classes.
      for i, j in itertools.product(range(n_classes), range(n_classes)):
        if i == j:
          continue
        assert np.isclose(
            np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))
+0 −88
Original line number Diff line number Diff line
@@ -18,35 +18,6 @@ import tensorflow as tf
import scipy.ndimage


def load_classification_data():
  """Loads classification data from example.csv"""
  # Resolve the example CSV relative to this test file's location.
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["outcome"]
  # (Removed unused local `task_type = "classification"` — it was never read.)
  input_file = os.path.join(current_dir,
                            "../../models/tests/example_classification.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_multitask_data():
  """Load example multitask data."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  # Seventeen tasks named task0..task16, matching the example CSV's columns.
  tasks = ["task%d" % i for i in range(17)]
  csv_path = os.path.join(current_dir,
                          "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(csv_path)


def load_solubility_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -549,65 +520,6 @@ class TestTransformers(unittest.TestCase):
    # Check that untransform does the right thing.
    np.testing.assert_allclose(power_transformer.untransform(y_t), y)

  def test_singletask_balancing_transformer(self):
    """Test balancing transformer on single-task dataset."""
    dataset = load_classification_data()
    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
    transformed = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (transformed.X, transformed.y, transformed.w,
                            transformed.ids)
    # A w-only transformer must leave ids, X and y untouched.
    for orig_id, new_id in zip(ids, ids_t):
      assert orig_id == new_id
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    for task_idx in range(len(transformed.get_task_names())):
      task_y = y_t[:, task_idx]
      task_w = w_t[:, task_idx]
      task_w_orig = w[:, task_idx]
      # Zero-weight entries must stay at zero weight after balancing.
      np.testing.assert_allclose(task_w[task_w_orig == 0],
                                 np.zeros_like(task_w[task_w_orig == 0]))
      # Total weight assigned to class 0 equals that assigned to class 1.
      assert np.isclose(
          np.sum(task_w[task_y == 0]), np.sum(task_w[task_y == 1]))

  def test_multitask_balancing_transformer(self):
    """Test balancing transformer on multitask dataset."""
    dataset = load_multitask_data()
    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
    transformed = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (transformed.X, transformed.y, transformed.w,
                            transformed.ids)
    # A w-only transformer must leave ids, X and y untouched.
    for orig_id, new_id in zip(ids, ids_t):
      assert orig_id == new_id
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    for task_idx in range(len(transformed.get_task_names())):
      task_y = y_t[:, task_idx]
      task_w = w_t[:, task_idx]
      task_w_orig = w[:, task_idx]
      # Zero-weight entries must stay at zero weight after balancing.
      np.testing.assert_allclose(task_w[task_w_orig == 0],
                                 np.zeros_like(task_w[task_w_orig == 0]))
      # Total weight assigned to class 0 equals that assigned to class 1.
      assert np.isclose(
          np.sum(task_w[task_y == 0]), np.sum(task_w[task_y == 1]))

  def test_coulomb_fit_transformer(self):
    """Test coulomb fit transformer on singletask dataset."""
    n_samples = 10
+71 −22
Original line number Diff line number Diff line
@@ -771,15 +771,37 @@ class LogTransformer(Transformer):
class BalancingTransformer(Transformer):
  """Balance positive and negative examples for weights.

  This class balances the sample weights so that the sum of all example
  weights from all classes is the same. This can be useful when you're
  working on an imbalanced dataset where there are far fewer examples of some
  classes than others.

  Example
  -------

  Here's an example for a binary dataset.

  >>> n_samples = 10
  >>> n_features = 3
  >>> n_tasks = 1
  >>> n_classes = 2
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features)
  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> transformer = dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  >>> dataset = transformer.transform(dataset)

  And here's a multiclass dataset example.

  >>> n_samples = 50
  >>> n_features = 3
  >>> n_tasks = 1
  >>> n_classes = 5
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features)
  >>> y = np.random.randint(2, size=(n_samples, n_tasks))
  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> transformer = dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
@@ -787,20 +809,21 @@ class BalancingTransformer(Transformer):

  Note
  ----
  This class can only transform `w`. Note at present this class only supports
  binary datasets and not multiclass datasets.
  This transformer is only meaningful for classification datasets where `y`
  takes on a limited set of values. This class can only transform `w` and does
  not transform `X` or `y`.

  Raises
  ------
  `ValueError` if `transform_X` or `transform_y` are set.
  `ValueError` if `transform_X` or `transform_y` are set. Also raises
  `ValueError` if `y` or `w` aren't of shape `(N,)` or `(N, n_tasks)`.
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               transform_w=False,
               dataset=None,
               seed=None):
               dataset=None):
    # BalancingTransformer can only transform weights.
    if transform_X or transform_y:
      raise ValueError("Cannot transform X or y")
@@ -815,22 +838,35 @@ class BalancingTransformer(Transformer):
    # Compute weighting factors from dataset.
    y = dataset.y
    w = dataset.w
    # Handle 1-D case
    if len(y.shape) == 1:
      y = np.reshape(y, (len(y), 1))
    if len(w.shape) == 1:
      w = np.reshape(w, (len(w), 1))
    if len(y.shape) != 2:
      raise ValueError("y must be of shape (N,) or (N, n_tasks)")
    if len(w.shape) != 2:
      raise ValueError("w must be of shape (N,) or (N, n_tasks)")
    # Ensure dataset is binary
    np.testing.assert_allclose(sorted(np.unique(y)), np.array([0., 1.]))
    self.classes = sorted(np.unique(y))
    #np.testing.assert_allclose(sorted(np.unique(y)), np.array([0., 1.]))
    weights = []
    for ind, task in enumerate(dataset.get_task_names()):
      task_w = w[:, ind]
      task_y = y[:, ind]
      # Remove labels with zero weights
      task_y = task_y[task_w != 0]
      num_positives = np.count_nonzero(task_y)
      num_negatives = len(task_y) - num_positives
      if num_positives > 0:
        pos_weight = float(num_negatives) / num_positives
      else:
        pos_weight = 1
      neg_weight = 1
      weights.append((neg_weight, pos_weight))
      N_task = len(task_y)
      class_counts = []
      # Note that by definition of classes, num_c >= 1 for all classes
      for c in self.classes:
        # this works because task_y is 1D
        num_c = len(np.where(task_y == c)[0])
        class_counts.append(num_c)
      # This is the right ratio since N_task/num_c * num_c = N_task
      # for all classes
      class_weights = [N_task / float(num_c) for num_c in class_counts]
      weights.append(class_weights)
    self.weights = weights

  def transform_array(self, X, y, w):
@@ -855,13 +891,26 @@ class BalancingTransformer(Transformer):
      Transformed array of weights
    """
    w_balanced = np.zeros_like(w)
    for ind in range(y.shape[1]):
    if len(y.shape) == 1:
      n_tasks = 1
    elif len(y.shape) == 2:
      n_tasks = y.shape[1]
    else:
      raise ValueError("y must be of shape (N,) or (N, n_tasks)")
    for ind in range(n_tasks):
      if n_tasks == 1:
        task_y = y
        task_w = w
      else:
        task_y = y[:, ind]
        task_w = w[:, ind]
      zero_indices = np.logical_and(task_y == 0, task_w != 0)
      one_indices = np.logical_and(task_y == 1, task_w != 0)
      w_balanced[zero_indices, ind] = self.weights[ind][0]
      w_balanced[one_indices, ind] = self.weights[ind][1]
      for i, c in enumerate(self.classes):
        class_indices = np.logical_and(task_y == c, task_w != 0)
        # Set to the class weight computed previously
        if n_tasks == 1:
          w_balanced[class_indices] = self.weights[ind][i]
        else:
          w_balanced[class_indices, ind] = self.weights[ind][i]
    return (X, y, w_balanced)