Commit 8616f858 authored by Vignesh's avatar Vignesh
Browse files

MinMax Transformer and tests

parent e9878ee7
Loading
Loading
Loading
Loading
+87 −0
Original line number Diff line number Diff line
@@ -172,6 +172,93 @@ class TestTransformers(unittest.TestCase):
    # Check that untransform does the right thing.
    np.testing.assert_allclose(log_transformer.untransform(X_t), X)

  def test_y_minmax_transformer(self):
    """Tests that MinMaxTransformer rescales y into [0, 1] and is invertible.

    The checks run twice: once on the solubility fixture and once on random
    multitask data, where untransform is also fed an array carrying an extra
    trailing axis.
    """
    # ---- Part 1: solubility fixture ----
    solubility_dataset = dc.data.tests.load_solubility_data()
    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_y=True, dataset=solubility_dataset)

    # Snapshot the raw arrays before the dataset is transformed.
    X = solubility_dataset.X
    y = solubility_dataset.y
    w = solubility_dataset.w
    ids = solubility_dataset.ids

    solubility_dataset = minmax_transformer.transform(solubility_dataset)
    X_t = solubility_dataset.X
    y_t = solubility_dataset.y
    w_t = solubility_dataset.w
    ids_t = solubility_dataset.ids

    # ids must survive the transformation untouched.
    for before, after in zip(ids, ids_t):
      assert before == after

    # Only y is being transformed, so X and w must come back unchanged.
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(w, w_t)

    # The transformed y must span exactly [0, 1].
    np.testing.assert_allclose(y_t.min(), 0.)
    np.testing.assert_allclose(y_t.max(), 1.)

    # untransform must recover the original y.
    y_roundtrip = minmax_transformer.untransform(y_t)
    np.testing.assert_allclose(y_roundtrip, y)

    # ---- Part 2: random multitask data ----
    n_samples = 100
    n_features = 10
    n_tasks = 10

    X = np.random.randn(n_samples, n_features)
    y = np.random.randn(n_samples, n_tasks)
    dataset = dc.data.NumpyDataset(X, y)

    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_y=True, dataset=dataset)
    w = dataset.w
    ids = dataset.ids

    dataset = minmax_transformer.transform(dataset)
    X_t = dataset.X
    y_t = dataset.y
    w_t = dataset.w
    ids_t = dataset.ids

    # ids must survive the transformation untouched.
    for before, after in zip(ids, ids_t):
      assert before == after

    # Only y is being transformed, so X and w must come back unchanged.
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(w, w_t)

    # The transformed y must span exactly [0, 1].
    np.testing.assert_allclose(y_t.min(), 0.)
    np.testing.assert_allclose(y_t.max(), 1.)

    # untransform must cope with an extra trailing axis of length 1 and keep
    # that axis in its output.
    y_t = np.expand_dims(y_t, axis=-1)
    y_restored = minmax_transformer.untransform(y_t)
    expected_shape = y.shape + (1,)
    assert y_restored.shape == expected_shape
    np.testing.assert_allclose(np.squeeze(y_restored, axis=-1), y)

  def test_X_minmax_transformer(self):
    """Tests that MinMaxTransformer rescales X into [0, 1] and is invertible."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_X=True, dataset=solubility_dataset)

    # Snapshot the raw arrays before the dataset is transformed.
    X = solubility_dataset.X
    y = solubility_dataset.y
    w = solubility_dataset.w
    ids = solubility_dataset.ids

    solubility_dataset = minmax_transformer.transform(solubility_dataset)
    X_t = solubility_dataset.X
    y_t = solubility_dataset.y
    w_t = solubility_dataset.w
    ids_t = solubility_dataset.ids

    # ids must survive the transformation untouched.
    for before, after in zip(ids, ids_t):
      assert before == after

    # Only X is being transformed, so y and w must come back unchanged.
    np.testing.assert_allclose(y, y_t)
    np.testing.assert_allclose(w, w_t)

    # The transformed X must span exactly [0, 1].
    np.testing.assert_allclose(X_t.min(), 0.)
    np.testing.assert_allclose(X_t.max(), 1.)

    # untransform must recover the original X.
    X_roundtrip = minmax_transformer.untransform(X_t)
    np.testing.assert_allclose(X_roundtrip, X)

  def test_y_normalization_transformer(self):
    """Tests normalization transformer."""
    solubility_dataset = dc.data.tests.load_solubility_data()
+64 −6
Original line number Diff line number Diff line
@@ -104,15 +104,54 @@ class Transformer(object):


class MinMaxTransformer(Transformer):
  """Rescales X or y (depending on whether transform_X or transform_y is True)
  to the range [0, 1].

  The minimum and maximum are taken along the first axis (axis 0) of the
  array, and every entry is shifted by that minimum and divided by the range
  (maximum value - minimum value). Where the range is zero the division
  yields nan, which np.nan_to_num replaces with 0. In case of multi-task
  learning, scaling every task to the same range ensures each task is given
  equal importance.

  Given original array A, the transformed array can be written as:
  A_min = np.min(A, axis=0)
  A_max = np.max(A, axis=0)
  A_t = np.nan_to_num((A - A_min)/(A_max - A_min))

  The minimum and maximum are read from the dataset passed to the
  constructor, so `dataset` must be supplied.

  Example:
  >>> n_samples = 10
  >>> n_features = 3
  >>> n_tasks = 1
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features)
  >>> y = np.random.rand(n_samples, n_tasks)
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> transformer = dc.trans.MinMaxTransformer(transform_y=True, dataset=dataset)
  >>> dataset = transformer.transform(dataset)
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               transform_w=False,
               dataset=None):
    """Initialization of MinMax transformer.

    Parameters
    ----------
    transform_X: bool, optional (default False)
      Whether to transform X
    transform_y: bool, optional (default False)
      Whether to transform y
    transform_w: bool, optional (default False)
      Whether to transform w
    dataset: dc.data.Dataset object, optional
      Dataset to be transformed
    """
    if transform_X:
      raise NotImplementedError("MinMax transformer does not work for X yet.")
    if transform_y:
      self.X_min = np.min(dataset.X, axis=0)
      self.X_max = np.max(dataset.X, axis=0)

    elif transform_y:
      self.y_min = np.min(dataset.y, axis=0)
      self.y_max = np.max(dataset.y, axis=0)

@@ -126,26 +165,45 @@ class MinMaxTransformer(Transformer):
        dataset=dataset)

  def transform(self, dataset, parallel=False):
    """Transforms the dataset by delegating to the parent class' transform.

    Parameters
    ----------
    dataset: object
      Dataset to transform; handed to the parent implementation unchanged.
    parallel: bool, optional (default False)
      Forwarded as-is to the parent implementation.

    Returns
    -------
    Whatever the parent class' transform returns.
    """
    parent = super(MinMaxTransformer, self)
    return parent.transform(dataset, parallel=parallel)

  def transform_array(self, X, y, w):
    """Transform the data in a set of (X, y, w) arrays.

    Exactly one of X or y is rescaled, using the per-column minimum and
    maximum stored on the transformer (self.X_min / self.X_max or
    self.y_min / self.y_max).

    Parameters
    ----------
    X: np.ndarray
      Features array; rescaled when self.transform_X is set.
    y: np.ndarray
      Labels array; rescaled when self.transform_X is not set and
      self.transform_y is.
    w: np.ndarray
      Weights array; always returned unchanged.

    Returns
    -------
    tuple
      The (X, y, w) triple with the selected array min-max scaled.
    """
    if self.transform_X:
      # A column with zero range gives 0/0 = nan; nan_to_num maps it to 0.
      X = np.nan_to_num((X - self.X_min) / (self.X_max - self.X_min))
    elif self.transform_y:
      y = np.nan_to_num((y - self.y_min) / (self.y_max - self.y_min))
    return (X, y, w)

  def untransform(self, z):
    """
    Undo transformation on provided data.

    Parameters
    ----------
    z: np.ndarray,
      Transformed X or y array

    Returns
    -------
    np.ndarray
      z mapped back to the original scale, with the same shape as z. When
      neither self.transform_X nor self.transform_y is set, nothing is
      computed and None is returned.
    """
    if self.transform_X:
      X_min = self.X_min
      X_max = self.X_max

      return z * (X_max - X_min) + X_min

    elif self.transform_y:
      y_min = self.y_min
      y_max = self.y_max

      n_tasks = len(y_min)

      # z may carry extra trailing axes of length 1 after the task axis
      # (e.g. shape (n_samples, n_tasks, 1)). Pad y_min / y_max with one
      # trailing axis per such dimension so the task axis lines up when
      # broadcasting. Only trailing singleton axes count: stop at the first
      # axis that is not of length 1, so a leading batch axis of length 1
      # (shape (1, n_tasks)) is not mistaken for padding. With a single task
      # the task axis itself has length 1 and no padding is needed.
      if n_tasks != 1:
        for dim in reversed(z.shape):
          if dim != 1:
            break
          y_min = np.expand_dims(y_min, -1)
          y_max = np.expand_dims(y_max, -1)

      y = z * (y_max - y_min) + y_min
      return y