Commit f3a9896f authored by Bharath Ramsundar
Browse files

Changes

parent 5202bd7d
Loading
Loading
Loading
Loading
+6 −6
Original line number Original line Diff line number Diff line
@@ -1321,23 +1321,23 @@ class DiskDataset(Dataset):


    >> newx, newy, neww = fn(x, y, w)
    >> newx, newy, neww = fn(x, y, w)


    It might be called only once with the whole dataset, or multiple times with different
    It might be called only once with the whole dataset, or multiple times
    subsets of the data.  Each time it is called, it should transform the samples and return
    with different subsets of the data.  Each time it is called, it should
    the transformed data.
    transform the samples and return the transformed data.


    Parameters
    Parameters
    ----------
    ----------
    fn: function
    fn: function
      A function to apply to each sample in the dataset
      A function to apply to each sample in the dataset
    out_dir: string
    out_dir: string
      The directory to save the new dataset in.  If this is omitted, a temporary directory
      The directory to save the new dataset in.  If this is omitted, a
      is created automatically
      temporary directory is created automatically


    Returns
    Returns
    -------
    -------
    a newly constructed Dataset object
    a newly constructed Dataset object
    """
    """
    if 'out_dir' in args:
    if 'out_dir' in args and args['out_dir'] is not None:
      out_dir = args['out_dir']
      out_dir = args['out_dir']
    else:
    else:
      out_dir = tempfile.mkdtemp()
      out_dir = tempfile.mkdtemp()
+162 −129
Original line number Original line Diff line number Diff line
import os
import numpy as np
import numpy as np
import unittest
import unittest
import deepchem as dc
import deepchem as dc
import itertools
import itertools
import os
import tempfile



class TestBalancingTransformer(unittest.TestCase):
  """
  Test BalancingTransformer functionality. 
  """


  def test_binary_1d(self):
def test_binary_1d():
  """Test balancing transformer on single-task dataset without explicit task dimension."""
  """Test balancing transformer on single-task dataset without explicit task dimension."""
  n_samples = 20
  n_samples = 20
  n_features = 3
  n_features = 3
@@ -42,7 +38,8 @@ class TestBalancingTransformer(unittest.TestCase):
  # Check that sum of 0s equals sum of 1s in transformed for each task
  # Check that sum of 0s equals sum of 1s in transformed for each task
  assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
  assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))


  def test_binary_singletask(self):

def test_binary_singletask():
  """Test balancing transformer on single-task dataset."""
  """Test balancing transformer on single-task dataset."""
  n_samples = 20
  n_samples = 20
  n_features = 3
  n_features = 3
@@ -74,10 +71,10 @@ class TestBalancingTransformer(unittest.TestCase):
    np.testing.assert_allclose(w_task[w_orig_task == 0],
    np.testing.assert_allclose(w_task[w_orig_task == 0],
                               np.zeros_like(w_task[w_orig_task == 0]))
                               np.zeros_like(w_task[w_orig_task == 0]))
    # Check that sum of 0s equals sum of 1s in transformed for each task
    # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
    assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))



  def test_binary_multitask(self):
def test_binary_multitask():
  """Test balancing transformer on multitask dataset."""
  """Test balancing transformer on multitask dataset."""
  n_samples = 10
  n_samples = 10
  n_features = 3
  n_features = 3
@@ -110,10 +107,10 @@ class TestBalancingTransformer(unittest.TestCase):
    np.testing.assert_allclose(w_task[w_orig_task == 0],
    np.testing.assert_allclose(w_task[w_orig_task == 0],
                               np.zeros_like(w_task[w_orig_task == 0]))
                               np.zeros_like(w_task[w_orig_task == 0]))
    # Check that sum of 0s equals sum of 1s in transformed for each task
    # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
    assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))



  def test_multiclass_singletask(self):
def test_multiclass_singletask():
  """Test balancing transformer on single-task dataset."""
  """Test balancing transformer on single-task dataset."""
  n_samples = 50
  n_samples = 50
  n_features = 3
  n_features = 3
@@ -146,3 +143,39 @@ class TestBalancingTransformer(unittest.TestCase):
        continue
        continue
      assert np.isclose(
      assert np.isclose(
          np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))
          np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))


def test_transform_to_directory():
  """Test that transformed output can be written to a chosen directory.

  Builds a small in-memory binary classification dataset, runs a
  BalancingTransformer over it with `out_dir` pointing at a temporary
  directory, then reloads a DiskDataset from that directory and checks
  that ids, X and y are unchanged and that the class weights are balanced.
  """
  n_samples = 20
  n_features = 3
  n_classes = 2
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(n_classes, size=(n_samples,))
  w = np.ones((n_samples,))
  # Pass ids explicitly so the id comparison below checks the values we
  # created here rather than whatever the dataset would generate by default.
  dataset = dc.data.NumpyDataset(X, y, w, ids=ids)

  balancing_transformer = dc.trans.BalancingTransformer(
      transform_w=True, dataset=dataset)
  with tempfile.TemporaryDirectory() as tmpdirname:
    # The return value is deliberately ignored: the point of the test is
    # that the result can be reloaded from `out_dir` alone.
    balancing_transformer.transform(dataset, out_dir=tmpdirname)
    balanced_dataset = dc.data.DiskDataset(tmpdirname)
    # Read the arrays while the temporary directory still exists.
    X_t, y_t, w_t, ids_t = (balanced_dataset.X, balanced_dataset.y,
                            balanced_dataset.w, balanced_dataset.ids)
  # Check ids are unchanged. zip() stops at the shorter input, so compare
  # lengths first to catch dropped or extra samples.
  assert len(ids_t) == len(ids)
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check X is unchanged since this is a w transformer
  np.testing.assert_allclose(X, X_t)
  # Check y is unchanged since this is a w transformer
  np.testing.assert_allclose(y, y_t)
  y_task = y_t
  w_task = w_t
  w_orig_task = w
  # Assert that entries with zero weight retain zero weight.
  # NOTE: w is all ones here, so this selects no entries; it is kept for
  # parity with the other balancing tests.
  np.testing.assert_allclose(w_task[w_orig_task == 0],
                             np.zeros_like(w_task[w_orig_task == 0]))
  # Check that sum of 0s equals sum of 1s in transformed for each task
  assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
+8 −2
Original line number Original line Diff line number Diff line
@@ -153,7 +153,7 @@ class Transformer(object):
    raise NotImplementedError(
    raise NotImplementedError(
        "Each Transformer is responsible for its own untransform method.")
        "Each Transformer is responsible for its own untransform method.")


  def transform(self, dataset, parallel=False, **kwargs):
  def transform(self, dataset, parallel=False, out_dir=None, **kwargs):
    """Transforms all internally stored data in dataset.
    """Transforms all internally stored data in dataset.


    This method transforms all internal data in the provided dataset by using
    This method transforms all internal data in the provided dataset by using
@@ -175,12 +175,18 @@ class Transformer(object):
    -------
    -------
    a newly constructed Dataset object
    a newly constructed Dataset object
    """
    """
    # Add this case in to handle non-DiskDataset that should be written to disk
    if out_dir is not None:
      if not isinstance(dataset, dc.data.DiskDataset):
        dataset = dc.data.DiskDataset.from_numpy(dataset.X, dataset.y,
                                                 dataset.w, dataset.ids)
    _, y_shape, w_shape, _ = dataset.get_shape()
    _, y_shape, w_shape, _ = dataset.get_shape()
    if y_shape == tuple() and self.transform_y:
    if y_shape == tuple() and self.transform_y:
      raise ValueError("Cannot transform y when y_values are not present")
      raise ValueError("Cannot transform y when y_values are not present")
    if w_shape == tuple() and self.transform_w:
    if w_shape == tuple() and self.transform_w:
      raise ValueError("Cannot transform w when w_values are not present")
      raise ValueError("Cannot transform w when w_values are not present")
    return dataset.transform(lambda X, y, w: self.transform_array(X, y, w))
    return dataset.transform(
        lambda X, y, w: self.transform_array(X, y, w), out_dir=out_dir)


  def transform_on_array(self, X, y, w):
  def transform_on_array(self, X, y, w):
    """Transforms numpy arrays X, y, and w
    """Transforms numpy arrays X, y, and w