Unverified Commit 9223166b authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1980 from deepchem/balancing

Overhauling Balancing Transformer
parents b5edbb45 56704e41
Loading
Loading
Loading
Loading
+37 −3
Original line number Diff line number Diff line
@@ -1655,6 +1655,36 @@ class DiskDataset(Dataset):
    return np.array(
        load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)

  def get_shard_y(self, i):
    """Return the label array for the i-th shard.

    Serves the labels from the in-memory shard cache when the shard is
    cached; otherwise loads them from the shard's y file on disk.

    Parameters
    ----------
    i: int
      Index of the shard whose labels are requested.
    """
    cached = self._cached_shards
    if cached is not None and cached[i] is not None:
      return cached[i].y
    shard_row = self.metadata_df.iloc[i]
    y_path = os.path.join(self.data_dir, shard_row['y'])
    return np.array(load_from_disk(y_path), dtype=object)

  def get_shard_w(self, i):
    """Return the weight array for the i-th shard.

    Serves the weights from the in-memory shard cache when the shard is
    cached; otherwise loads them from the shard's w file on disk.

    Parameters
    ----------
    i: int
      Index of the shard whose weights are requested.
    """
    cached = self._cached_shards
    if cached is not None and cached[i] is not None:
      return cached[i].w
    shard_row = self.metadata_df.iloc[i]
    w_path = os.path.join(self.data_dir, shard_row['w'])
    return np.array(load_from_disk(w_path), dtype=object)

  def add_shard(self, X, y, w, ids):
    """Adds a data shard."""
    metadata_rows = self.metadata_df.values.tolist()
@@ -1758,9 +1788,12 @@ class DiskDataset(Dataset):
  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    if len(self) == 0:
      return np.array([])
    ys = []
    one_dimensional = False
    for (_, y_b, _, _) in self.itershards():
    for i in range(self.get_number_shards()):
      y_b = self.get_shard_y(i)
      ys.append(y_b)
      if len(y_b.shape) == 1:
        one_dimensional = True
@@ -1774,8 +1807,9 @@ class DiskDataset(Dataset):
    """Get the weight vector for this dataset as a single numpy array."""
    ws = []
    one_dimensional = False
    for (_, _, w_b, _) in self.itershards():
      ws.append(np.array(w_b))
    for i in range(self.get_number_shards()):
      w_b = self.get_shard_w(i)
      ws.append(w_b)
      if len(w_b.shape) == 1:
        one_dimensional = True
    if not one_dimensional:
+30 −0
Original line number Diff line number Diff line
import numpy as np
import deepchem as dc


def test_y_property():
  """Check that DiskDataset.y returns exactly the labels stored at creation."""
  n_rows = 10
  n_cols = 10
  n_tasks = 1
  X = np.random.rand(n_rows, n_cols)
  y = np.random.randint(2, size=(n_rows, n_tasks))
  w = np.ones((n_rows, n_tasks))
  ids = np.array(["id"] * n_rows)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Round-trip through disk storage must preserve the labels.
  np.testing.assert_array_equal(y, dataset.y)


def test_w_property():
  """Test that dataset.w works."""
  num_datapoints = 10
  num_features = 10
  num_tasks = 1
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.ones((num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Round-trip through disk storage must preserve the weights.
  w_out = dataset.w
  np.testing.assert_array_equal(w, w_out)
+1 −1
Original line number Diff line number Diff line
@@ -581,7 +581,7 @@ class TestSplitter(unittest.TestCase):
      w = dataset.w
      # verify that there are no rows (samples) in weights matrix w
      # that have no hits.
      assert len(np.where(~w.any(axis=1))[0]) == 0
      assert len(np.where(w.any(axis=1) == 0)[0]) == 0

  def test_indice_split(self):

+148 −0
Original line number Diff line number Diff line
import numpy as np
import unittest
import deepchem as dc
import itertools
import os


class TestBalancingTransformer(unittest.TestCase):
  """Exercise BalancingTransformer weight rebalancing on several dataset shapes."""

  def test_binary_1d(self):
    """Binary labels without an explicit task dimension."""
    n_samples = 20
    n_features = 3
    n_classes = 2
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples,))
    w = np.ones((n_samples,))
    dataset = dc.data.NumpyDataset(X, y, w)

    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = dataset.X, dataset.y, dataset.w, dataset.ids
    # A w-only transform must leave ids, X and y untouched.
    assert all(a == b for a, b in zip(ids, ids_t))
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    # Entries that started with zero weight must keep zero weight.
    zeroed = w == 0
    np.testing.assert_allclose(w_t[zeroed], np.zeros_like(w_t[zeroed]))
    # After balancing, both classes carry equal total weight.
    assert np.isclose(np.sum(w_t[y_t == 0]), np.sum(w_t[y_t == 1]))

  def test_binary_singletask(self):
    """Binary labels on a single task with an explicit task dimension."""
    n_samples = 20
    n_features = 3
    n_tasks = 1
    n_classes = 2
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w)

    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = dataset.X, dataset.y, dataset.w, dataset.ids
    # A w-only transform must leave ids, X and y untouched.
    assert all(a == b for a, b in zip(ids, ids_t))
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(dataset.get_task_names()):
      y_col = y_t[:, ind]
      w_col = w_t[:, ind]
      zeroed = w[:, ind] == 0
      # Entries that started with zero weight must keep zero weight.
      np.testing.assert_allclose(w_col[zeroed], np.zeros_like(w_col[zeroed]))
      # After balancing, both classes carry equal total weight per task.
      assert np.isclose(
          np.sum(w_col[y_col == 0]), np.sum(w_col[y_col == 1]))

  def test_binary_multitask(self):
    """Binary labels across several tasks balanced independently."""
    n_samples = 10
    n_features = 3
    n_tasks = 5
    n_classes = 2
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    multitask_dataset = dc.data.NumpyDataset(X, y, w)
    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=multitask_dataset)
    multitask_dataset = transformer.transform(multitask_dataset)
    X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
                            multitask_dataset.w, multitask_dataset.ids)
    # A w-only transform must leave ids, X and y untouched.
    assert all(a == b for a, b in zip(ids, ids_t))
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(multitask_dataset.get_task_names()):
      y_col = y_t[:, ind]
      w_col = w_t[:, ind]
      zeroed = w[:, ind] == 0
      # Entries that started with zero weight must keep zero weight.
      np.testing.assert_allclose(w_col[zeroed], np.zeros_like(w_col[zeroed]))
      # After balancing, both classes carry equal total weight per task.
      assert np.isclose(
          np.sum(w_col[y_col == 0]), np.sum(w_col[y_col == 1]))

  def test_multiclass_singletask(self):
    """Five-class labels on a single task."""
    n_samples = 50
    n_features = 3
    n_tasks = 1
    n_classes = 5
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w)

    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = dataset.X, dataset.y, dataset.w, dataset.ids
    # A w-only transform must leave ids, X and y untouched.
    assert all(a == b for a, b in zip(ids, ids_t))
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(dataset.get_task_names()):
      y_col = y_t[:, ind]
      w_col = w_t[:, ind]
      # Every ordered pair of distinct classes must carry equal total weight.
      for i, j in itertools.permutations(range(n_classes), 2):
        assert np.isclose(
            np.sum(w_col[y_col == i]), np.sum(w_col[y_col == j]))
+108 −0
Original line number Diff line number Diff line
import os
import numpy as np
import deepchem as dc


def load_solubility_data():
  """Featurize the example solubility CSV and return it as a dataset."""
  here = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(here, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=["log-solubility"],
      smiles_field="smiles",
      featurizer=dc.feat.CircularFingerprint(size=1024))
  return loader.create_dataset(csv_path)


def test_y_minmax_transformer():
  """MinMaxTransformer on y: labels map into [0, 1] and round-trip back."""
  solubility_dataset = load_solubility_data()
  transformer = dc.trans.MinMaxTransformer(
      transform_y=True, dataset=solubility_dataset)
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
  solubility_dataset = transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                          solubility_dataset.w, solubility_dataset.ids)

  # ids must survive a y-only transform untouched.
  assert all(a == b for a, b in zip(ids, ids_t))

  # X and w must survive a y-only transform untouched.
  np.testing.assert_allclose(X, X_t)
  np.testing.assert_allclose(w, w_t)

  # Transformed labels span exactly [0, 1].
  np.testing.assert_allclose(y_t.min(), 0.)
  np.testing.assert_allclose(y_t.max(), 1.)

  # untransform recovers the original labels.
  y_restored = transformer.untransform(y_t)
  assert np.max(y_restored - y) < 1e-5


def test_y_minmax_random():
  """MinMaxTransformer on random y, including untransform of expanded dims."""
  n_samples = 100
  n_features = 10
  n_tasks = 10

  X = np.random.randn(n_samples, n_features)
  y = np.random.randn(n_samples, n_tasks)
  dataset = dc.data.NumpyDataset(X, y)

  transformer = dc.trans.MinMaxTransformer(transform_y=True, dataset=dataset)
  w, ids = dataset.w, dataset.ids

  dataset = transformer.transform(dataset)
  X_t, y_t, w_t, ids_t = dataset.X, dataset.y, dataset.w, dataset.ids
  # ids must survive a y-only transform untouched.
  assert all(a == b for a, b in zip(ids, ids_t))

  # X and w must survive a y-only transform untouched.
  np.testing.assert_allclose(X, X_t)
  np.testing.assert_allclose(w, w_t)

  # Transformed labels span exactly [0, 1].
  np.testing.assert_allclose(y_t.min(), 0.)
  np.testing.assert_allclose(y_t.max(), 1.)

  # untransform must tolerate a trailing singleton axis and still invert.
  y_expanded = np.expand_dims(y_t, axis=-1)
  y_restored = transformer.untransform(y_expanded)
  assert y_restored.shape == y.shape + (1,)
  np.testing.assert_allclose(np.squeeze(y_restored, axis=-1), y)


def test_X_minmax_transformer():
  """Tests MinMax transformer on X: features map into [0, 1] and round-trip."""
  solubility_dataset = load_solubility_data()
  minmax_transformer = dc.trans.MinMaxTransformer(
      transform_X=True, dataset=solubility_dataset)
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
  solubility_dataset = minmax_transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                          solubility_dataset.w, solubility_dataset.ids)

  # Check ids are unchanged before and after transformation
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt

  # Check y is unchanged since transform_X is true
  np.testing.assert_allclose(y, y_t)
  # Check w is unchanged since transform_X is true
  np.testing.assert_allclose(w, w_t)

  # Check minimum and maximum values of transformed X are 0 and 1
  np.testing.assert_allclose(X_t.min(), 0.)
  np.testing.assert_allclose(X_t.max(), 1.)

  # Check untransform works correctly
  np.testing.assert_allclose(minmax_transformer.untransform(X_t), X)
Loading