Commit eec8d8b9 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #385 from joegomes/clipy

ClippingTransformer
parents 31aaf9c2 8b86216e
Loading
Loading
Loading
Loading
+50 −0
Original line number Diff line number Diff line
@@ -243,6 +243,56 @@ class TestTransformers(unittest.TestCase):
    # Check that untransform does the right thing.
    np.testing.assert_allclose(cdf_transformer.untransform(y_t), y)

  def test_clipping_X_transformer(self):
    """Test clipping transformer on X of singletask dataset.

    Every row of X holds one value above ``x_max``, one value below
    ``-x_max`` and one value inside the allowed range, so the upper bound,
    the lower bound and the pass-through case are all exercised.
    """
    n_samples = 10
    n_tasks = 1
    x_max = 5.
    ids = np.arange(n_samples)
    # Three features per sample: too large, too small, already in range.
    X = np.tile(np.array([6., -6., 2.]), (n_samples, 1))
    # Expected values are built independently of X because the transformer
    # may clip the array it is handed in place.
    target = np.tile(np.array([x_max, -x_max, 2.]), (n_samples, 1))
    y = np.zeros((n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w, ids)
    transformer = dc.trans.ClippingTransformer(transform_X=True, x_max=x_max)
    clipped_dataset = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (clipped_dataset.X, clipped_dataset.y,
                            clipped_dataset.w, clipped_dataset.ids)
    # Check ids are unchanged (list comparison also catches a length change).
    self.assertEqual(list(ids_t), list(ids))
    # Check y is unchanged since this is an X transformer
    np.testing.assert_allclose(y_t, np.zeros((n_samples, n_tasks)))
    # Check w is unchanged since this is an X transformer
    np.testing.assert_allclose(w_t, np.ones((n_samples, n_tasks)))
    # Check X was clipped to [-x_max, x_max] and in-range values survived.
    np.testing.assert_allclose(X_t, target)
 
  def test_clipping_y_transformer(self):
    """Test clipping transformer on y of singletask dataset.

    The labels cycle through a value above ``y_max``, a value below
    ``-y_max`` and a value inside the allowed range, so the upper bound,
    the lower bound and the pass-through case are all exercised.
    """
    n_samples = 10
    n_features = 3
    n_tasks = 1
    y_max = 5.
    ids = np.arange(n_samples)
    X = np.zeros((n_samples, n_features))
    # np.resize repeats the pattern until the requested shape is filled.
    y = np.resize(np.array([6., -6., 2.]), (n_samples, n_tasks))
    # Expected values are built independently of y because the transformer
    # may clip the array it is handed in place.
    target = np.resize(np.array([y_max, -y_max, 2.]), (n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w, ids)
    transformer = dc.trans.ClippingTransformer(transform_y=True, y_max=y_max)
    clipped_dataset = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (clipped_dataset.X, clipped_dataset.y,
                            clipped_dataset.w, clipped_dataset.ids)
    # Check ids are unchanged (list comparison also catches a length change).
    self.assertEqual(list(ids_t), list(ids))
    # Check X is unchanged since this is a y transformer
    np.testing.assert_allclose(X_t, np.zeros((n_samples, n_features)))
    # Check w is unchanged since this is a y transformer
    np.testing.assert_allclose(w_t, np.ones((n_samples, n_tasks)))
    # Check y was clipped to [-y_max, y_max] and in-range values survived.
    np.testing.assert_allclose(y_t, target)
  
  def test_power_X_transformer(self):
    """Test Power transformer on Gaussian normal dataset."""
    gaussian_dataset = dc.data.tests.load_gaussian_cdf_data()
+138 −29
Original line number Diff line number Diff line
@@ -157,29 +157,84 @@ class NormalizationTransformer(Transformer):
      return transformed_grad

class ClippingTransformer(Transformer):
  """Clip large values in datasets.

     Entries of X (and/or y) whose magnitude exceeds the configured bound are
     replaced by that bound with the sign preserved, so each selected array
     ends up inside [-max, +max]. Weights are never clipped.

     Example:

     >>> n_samples = 10
     >>> n_features = 3
     >>> n_tasks = 1
     >>> ids = np.arange(n_samples)
     >>> X = np.random.rand(n_samples, n_features)
     >>> y = np.zeros((n_samples, n_tasks))
     >>> w = np.ones((n_samples, n_tasks))
     >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
     >>> transformer = dc.trans.ClippingTransformer(transform_X=True)
     >>> dataset = transformer.transform(dataset)

  """

  def __init__(self, transform_X=False, transform_y=False,
               transform_w=False, dataset=None, x_max=5., y_max=500.):
    """Initialize clipping transformation.

    Parameters:
    ----------
    transform_X: bool, optional (default False)
      Whether to transform X
    transform_y: bool, optional (default False)
      Whether to transform y
    transform_w: bool, optional (default False)
      Whether to transform w. Must be False; clipping weights is not
      supported and triggers an assertion failure.
    dataset: dc.data.Dataset object, optional
      Dataset to be transformed
    x_max: float, optional (default 5.)
      Maximum absolute value for X
    y_max: float, optional (default 500.)
      Maximum absolute value for y

    """
    super(ClippingTransformer, self).__init__(transform_X=transform_X,
                                              transform_y=transform_y,
                                              transform_w=transform_w,
                                              dataset=dataset)
    # Only X and y can be clipped; transform_array never touches w.
    assert not transform_w
    self.x_max = x_max
    self.y_max = y_max

  def transform_array(self, X, y, w):
    """Transform the data in a set of (X, y, w) arrays.

    The selected arrays are clipped in place and the same objects are
    returned.

    Parameters:
    ----------
    X: np.ndarray
      Features
    y: np.ndarray
      Tasks
    w: np.ndarray
      Weights

    Returns:
    -------
    X: np.ndarray
      Transformed features
    y: np.ndarray
      Transformed tasks
    w: np.ndarray
      Transformed weights

    """
    if self.transform_X:
      # Symmetric clip: anything beyond +/- x_max is pulled back to the bound.
      X[X > self.x_max] = self.x_max
      X[X < (-1.0*self.x_max)] = -1.0 * self.x_max
    if self.transform_y:
      # Symmetric clip: anything beyond +/- y_max is pulled back to the bound.
      y[y > self.y_max] = self.y_max
      y[y < (-1.0*self.y_max)] = -1.0 * self.y_max
    return (X, y, w)

  def untransform(self, z):
    """Reject attempts to undo clipping.

    Clipping discards the original magnitude of every clipped entry, so
    there is nothing to invert.

    Raises:
    ------
    NotImplementedError
      Always.

    """
    raise NotImplementedError(
      "Cannot untransform datasets with ClippingTransformer.")

class LogTransformer(Transformer):

@@ -385,13 +440,7 @@ class PowerTransformer(Transformer):
    return z

class CoulombFitTransformer():
  """Performs randomization and binarization operations on batches of Coulomb Matrix features during fit."""
  def __init__(self, dataset):

    """Initializes CoulombFitTransformer.
    Parameters:
    ----------
    dataset: dc.data.Dataset object
  """Performs randomization and binarization operations on batches of Coulomb Matrix features during fit.

     Example:

@@ -407,6 +456,15 @@ class CoulombFitTransformer():
     >>> model = dc.models.TensorflowMultiTaskFitTransformRegressor(
            n_tasks, [n_features, n_features], batch_size=n_samples,
            fit_transformers=fit_transformers, n_evals=1)
  """

  def __init__(self, dataset):

    """Initializes CoulombFitTransformer.

    Parameters:
    ----------
    dataset: dc.data.Dataset object

    """
    X = dataset.X
@@ -422,7 +480,20 @@ class CoulombFitTransformer():
    self.std = (X - self.mean).std()

  def realize(self, X):
    """Randomize features. """
    """Randomize features.

    Parameters:
    ----------
    X: np.ndarray
      Features

    Returns:
    -------
    X: np.ndarray
      Randomized features


    """
    def _realize_(x):
      inds = np.argsort(-(x**2).sum(axis=0)**.5+np.random.normal(0,self.noise,x[0].shape))
      x = x[inds,:][:,inds]*1
@@ -431,11 +502,35 @@ class CoulombFitTransformer():
    return np.array([_realize_(z) for z in X])

  def normalize(self, X):
    """Normalize features.

    Subtracts ``self.mean`` from X and divides the result by ``self.std``.

    Parameters:
    ----------
    X: np.ndarray
      Features

    Returns:
    -------
    X: np.ndarray
      Normalized features

    """
    return (X-self.mean)/self.std

  def expand(self, X):
    """Binarize features. """
    """Binarize features. 

    Parameters:
    ----------
    X: np.ndarray
      Features

    Returns:
    -------
    X: np.ndarray
      Binarized features

    """
    Xexp = []
    for i in range(X.shape[1]):
      for k in np.arange(0,self.max[i]+self.step,self.step):
@@ -443,6 +538,20 @@ class CoulombFitTransformer():
    return np.array(Xexp).T
      
  def X_transform(self, X):
    """Run the full Coulomb Fit pipeline on a batch of features.

    The features are passed through ``realize``, then ``expand`` and
    finally ``normalize``, in that order.

    Parameters:
    ----------
    X: np.ndarray
      Features

    Returns:
    -------
    X: np.ndarray
      Transformed features

    """
    randomized = self.realize(X)
    binarized = self.expand(randomized)
    return self.normalize(binarized)