Commit a3e0fb2a, authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #286 from flee2/master

y-transform and untransform for CDF and power transformers
parents bab3488a c5f9a908
Loading
Loading
Loading
Loading
+36 −17
Original line number Diff line number Diff line
@@ -200,8 +200,7 @@ class TestTransformers(unittest.TestCase):
    target = np.transpose(np.array(np.append([target],[target], axis=0)))
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    bins=1001
    cdf_transformer = dc.trans.CDFTransformer(transform_X=True,
                                                     bins=bins)
    cdf_transformer = dc.trans.CDFTransformer(transform_X=True, dataset=gaussian_dataset, bins=bins)
    X, y, w, ids = (gaussian_dataset.X,gaussian_dataset.y,gaussian_dataset.w,gaussian_dataset.ids)
    gaussian_dataset = cdf_transformer.transform(gaussian_dataset, bins=bins)
    X_t, y_t, w_t, ids_t = (gaussian_dataset.X,gaussian_dataset.y,gaussian_dataset.w,gaussian_dataset.ids)
@@ -217,21 +216,16 @@ class TestTransformers(unittest.TestCase):
    sorted = np.sort(X_t,axis=0)
    np.testing.assert_allclose(sorted, target)

  """
  def test_cdf_y_transformer(self):
    #Test CDF transformer on Gaussian normal dataset.
    target = np.array(np.transpose(np.linspace(0.,1.,1001)))
    target = np.transpose(np.array(np.append([target],[target], axis=0)))
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    bins=1001
    cdf_transformer = dc.trans.CDFTransformer(transform_y=True, bins=bins)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y, gaussian_dataset.w,
                    gaussian_dataset.ids)
    cdf_transformer = dc.trans.CDFTransformer(transform_y=True, dataset=gaussian_dataset, bins=bins)
    X, y, w, ids = (gaussian_dataset.X,gaussian_dataset.y,gaussian_dataset.w,gaussian_dataset.ids)
    gaussian_dataset = cdf_transformer.transform(gaussian_dataset, bins=bins)
    gaussian_dataset = dc.data.DiskDataset(
        data_dir=gaussian_dataset.data_dir,reload=True)
    X_t, y_t, w_t, ids_t = (gaussian_dataset.X, gaussian_dataset.y,
                            gaussian_dataset.w, gaussian_dataset.ids)
    X_t, y_t, w_t, ids_t = (gaussian_dataset.X,gaussian_dataset.y,gaussian_dataset.w,gaussian_dataset.ids)

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
@@ -243,19 +237,18 @@ class TestTransformers(unittest.TestCase):
    # Check y is now holding the proper values when sorted.
    sorted = np.sort(y_t,axis=0)
    np.testing.assert_allclose(sorted, target)
  """

    # Check that untransform does the right thing.
    np.testing.assert_allclose(cdf_transformer.untransform(y_t), y)
  
  def test_power_X_transformer(self):
    """Test Power transformer on Gaussian normal dataset."""
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    powers=[1,2,0.5]
    power_transformer = dc.trans.PowerTransformer(
        transform_X=True, powers=powers)
    power_transformer = dc.trans.PowerTransformer(transform_X=True, powers=powers)
    X, y, w, ids = (gaussian_dataset.X,gaussian_dataset.y,gaussian_dataset.w,gaussian_dataset.ids)
    gaussian_dataset = power_transformer.transform(gaussian_dataset)
    gaussian_dataset = dc.data.DiskDataset(
        data_dir=gaussian_dataset.data_dir,reload=True)
    X_t, y_t, w_t, ids_t = (gaussian_dataset.X,gaussian_dataset.y,gaussian_dataset.w,gaussian_dataset.ids)
    gaussian_dataset2 = power_transformer.transform(gaussian_dataset)
    X_t, y_t, w_t, ids_t = (gaussian_dataset2.X,gaussian_dataset2.y,gaussian_dataset2.w,gaussian_dataset2.ids)

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
@@ -265,10 +258,36 @@ class TestTransformers(unittest.TestCase):
    # Check w is unchanged since this is an X transformer
    np.testing.assert_allclose(w, w_t)
    # Check X is now holding the proper values in each column.
    np.testing.assert_allclose(X_t.shape[1],len(powers)*X.shape[1])
    np.testing.assert_allclose(X, X_t[:,:2])
    np.testing.assert_allclose(np.power(X,2),X_t[:,2:4])
    np.testing.assert_allclose(np.power(X,0.5),X_t[:,4:])
  
  def test_power_y_transformer(self):
    """Test Power transformer applied to y on Gaussian normal dataset."""
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
    powers = [1, 2, 0.5]
    power_transformer = dc.trans.PowerTransformer(
        transform_y=True, powers=powers)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y,
                    gaussian_dataset.w, gaussian_dataset.ids)
    gaussian_dataset2 = power_transformer.transform(gaussian_dataset)
    X_t, y_t, w_t, ids_t = (gaussian_dataset2.X, gaussian_dataset2.y,
                            gaussian_dataset2.w, gaussian_dataset2.ids)

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a y transformer
    np.testing.assert_allclose(X, X_t)
    # Check w is unchanged since this is a y transformer
    np.testing.assert_allclose(w, w_t)
    # Check y_t holds one block of columns per power, in the order the
    # powers were given. Slicing by the task count keeps the check valid
    # if the fixture's number of tasks or the powers list changes.
    n_tasks = y.shape[1]
    assert y_t.shape[1] == len(powers) * n_tasks
    for i, power in enumerate(powers):
      np.testing.assert_allclose(
          np.power(y, power), y_t[:, i * n_tasks:(i + 1) * n_tasks])

    # Check that untransform does the right thing.
    np.testing.assert_allclose(power_transformer.untransform(y_t), y)

  def test_singletask_balancing_transformer(self):
    """Test balancing transformer on single-task dataset."""

+19 −17
Original line number Diff line number Diff line
@@ -533,12 +533,14 @@ class CDFTransformer(Transformer):
  """Histograms the data and assigns values based on sorted list."""
  """Acts like a Cumulative Distribution Function (CDF)."""
  def __init__(self, transform_X=False,
               transform_y=False, dataset=None,
               bins=2):
    """Store the transformer configuration.

    transform_X / transform_y select which array is transformed, bins is
    the number of histogram bins, and dataset (optional) supplies the
    original y values that are kept on the instance as self.y.
    """
    self.transform_X = transform_X
    self.transform_y = transform_y
    self.bins = bins
    # `dataset` defaults to None, so guard the attribute access instead of
    # raising AttributeError for callers that do not pass a dataset.
    self.y = dataset.y if dataset is not None else None
    #self.w = dataset.w
    # TODO (flee2): for transform_y, figure out weights

  def transform(self, dataset, bins):
    """Performs CDF transform on data."""
@@ -549,16 +551,16 @@ class CDFTransformer(Transformer):
      X_t = get_cdf_values(X,self.bins)
      y_t = y
    if self.transform_y:
      print("y will not be transformed by CDFTransformer, for now.")
      """
      y_t = get_cdf_values(y,self.bins)
      X_t = X
      """
      y_t = get_cdf_values(y,self.bins)
      #print("y will not be transformed by CDFTransformer, for now.")
    return NumpyDataset(X_t, y_t, w_t, ids_t)

  def untransform(self, z):
    """Undo the CDF transform on y by returning the stored labels.

    z is not inspected: when self.transform_y is set, the self.y kept on
    the instance is returned as-is. Otherwise nothing is undone and None
    is returned.
    """
    # Need this for transform_y
    if self.transform_y:
      return self.y
    # No inverse is implemented for the transform_X case.
    return None

def get_cdf_values(array, bins):
  #array = np.transpose(array)
@@ -601,23 +603,23 @@ class PowerTransformer(Transformer):
      	X_t = np.hstack((X_t,np.power(X, self.powers[i])))
      y_t = y
    if self.transform_y:
      print("y will not be transformed by PowerTransformer, for now.")
      """
      # print("y will not be transformed by PowerTransformer, for now.")
      y_t = np.power(y, self.powers[0])
      for i in range(1, n_powers):
      	y_t = np.hstack((y_t,np.power(y, self.powers[i])))
      X_t = X
    """

    # TODO (rbharath): Find a more elegant solution to saving the data?
    shutil.rmtree(dataset.data_dir)
    os.makedirs(dataset.data_dir)
    DiskDataset.from_numpy(X_t, y_t, w_t, ids_t, data_dir=dataset.data_dir)
    DiskDataset.from_numpy(dataset.data_dir, X_t, y_t, w_t, ids_t)
    return dataset
    """
    return NumpyDataset(X_t, y_t, w_t, ids_t)

  def untransform(self, z):
    """Recover the original columns from power-transformed data.

    Assumes z holds one equally wide block of columns per entry of
    self.powers, stacked horizontally with the self.powers[0] block first.
    Only that first block is kept, and it is raised to 1/self.powers[0].
    """
    n_powers = len(self.powers)
    # Floor division: the result is used as a slice bound, which must be
    # an integer (true division would produce a float on Python 3).
    orig_len = z.shape[1] // n_powers
    z = z[:, :orig_len]
    # 1.0 keeps the exponent a float even where `/` on two ints truncates.
    z = np.power(z, 1.0 / self.powers[0])
    return z