Commit e49868d3 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

changes

parent a4b6db5d
Loading
Loading
Loading
Loading
+108 −0
Original line number Diff line number Diff line
import os
import numpy as np
import deepchem as dc


def load_solubility_data():
  """Featurize and load the example solubility dataset.

  Returns
  -------
  dc.data.Dataset
    Dataset with circular-fingerprint features (size 1024) and a single
    "log-solubility" regression task, loaded from the shared test CSV.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  # NOTE: the dead local `task_type = "regression"` from an older API was
  # removed; CSVLoader only needs the task names.
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=["log-solubility"], smiles_field="smiles", featurizer=featurizer)

  return loader.create_dataset(input_file)


def test_y_minmax_transformer():
  """Tests MinMax transformer on y of the solubility dataset."""
  solubility_dataset = load_solubility_data()
  minmax_transformer = dc.trans.MinMaxTransformer(
      transform_y=True, dataset=solubility_dataset)
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
  solubility_dataset = minmax_transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                          solubility_dataset.w, solubility_dataset.ids)

  # Check ids are unchanged before and after transformation
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt

  # Check X is unchanged since transform_y is true
  np.testing.assert_allclose(X, X_t)
  # Check w is unchanged since transform_y is true
  np.testing.assert_allclose(w, w_t)

  # Check minimum and maximum values of transformed y are 0 and 1
  np.testing.assert_allclose(y_t.min(), 0.)
  np.testing.assert_allclose(y_t.max(), 1.)

  # Check untransform works correctly. Compare with an absolute tolerance:
  # the previous check `np.max(y_restored - y) < 1e-5` was on the *signed*
  # difference, so it would pass trivially whenever y_restored
  # underestimated y everywhere, no matter how badly.
  y_restored = minmax_transformer.untransform(y_t)
  np.testing.assert_allclose(y_restored, y, atol=1e-5)


def test_y_minmax_random():
  """Exercise MinMaxTransformer on randomly generated multitask data."""
  n_samples, n_features, n_tasks = 100, 10, 10

  X = np.random.randn(n_samples, n_features)
  y = np.random.randn(n_samples, n_tasks)
  dataset = dc.data.NumpyDataset(X, y)

  transformer = dc.trans.MinMaxTransformer(transform_y=True, dataset=dataset)
  w, ids = dataset.w, dataset.ids

  dataset = transformer.transform(dataset)
  X_t, y_t, w_t, ids_t = dataset.X, dataset.y, dataset.w, dataset.ids

  # ids must survive the transformation untouched
  for before, after in zip(ids, ids_t):
    assert before == after

  # Only y is transformed, so X and w stay identical
  np.testing.assert_allclose(X, X_t)
  np.testing.assert_allclose(w, w_t)

  # Transformed labels must span exactly [0, 1]
  np.testing.assert_allclose(y_t.min(), 0.)
  np.testing.assert_allclose(y_t.max(), 1.)

  # untransform should pass a trailing singleton dimension through unchanged
  y_t = np.expand_dims(y_t, axis=-1)
  y_restored = transformer.untransform(y_t)
  assert y_restored.shape == y.shape + (1,)
  np.testing.assert_allclose(np.squeeze(y_restored, axis=-1), y)


def test_X_minmax_transformer():
  """Tests MinMax transformer on X of the solubility dataset."""
  solubility_dataset = load_solubility_data()
  minmax_transformer = dc.trans.MinMaxTransformer(
      transform_X=True, dataset=solubility_dataset)
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
  solubility_dataset = minmax_transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                          solubility_dataset.w, solubility_dataset.ids)

  # Check ids are unchanged before and after transformation
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt

  # Check y is unchanged since transform_X is true
  np.testing.assert_allclose(y, y_t)
  # Check w is unchanged since transform_X is true
  np.testing.assert_allclose(w, w_t)

  # Check minimum and maximum values of transformed X are 0 and 1
  np.testing.assert_allclose(X_t.min(), 0.)
  np.testing.assert_allclose(X_t.max(), 1.)

  # Check untransform works correctly
  np.testing.assert_allclose(minmax_transformer.untransform(X_t), X)
+3 −88
Original line number Diff line number Diff line
@@ -218,93 +218,6 @@ class TestTransformers(unittest.TestCase):
    # Check that untransform does the right thing.
    np.testing.assert_allclose(log_transformer.untransform(X_t), X)

  def test_y_minmax_transformer(self):
    """Tests MinMax transformer. """
    solubility_dataset = load_solubility_data()
    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_y=True, dataset=solubility_dataset)
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = minmax_transformer.transform(solubility_dataset)
    X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                            solubility_dataset.w, solubility_dataset.ids)

    # Check ids are unchanged before and after transformation
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt

    # Check X is unchanged since transform_y is true
    np.testing.assert_allclose(X, X_t)
    # Check w is unchanged since transform_y is true
    np.testing.assert_allclose(w, w_t)

    # Check minimum and maximum values of transformed y are 0 and 1
    np.testing.assert_allclose(y_t.min(), 0.)
    np.testing.assert_allclose(y_t.max(), 1.)

    # Check untransform works correctly
    np.testing.assert_allclose(minmax_transformer.untransform(y_t), y)

    # Test on random example
    n_samples = 100
    n_features = 10
    n_tasks = 10

    X = np.random.randn(n_samples, n_features)
    y = np.random.randn(n_samples, n_tasks)
    dataset = dc.data.NumpyDataset(X, y)

    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_y=True, dataset=dataset)
    w, ids = dataset.w, dataset.ids

    dataset = minmax_transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Check ids are unchanged before and after transformation
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt

    # Check X is unchanged since transform_y is true
    np.testing.assert_allclose(X, X_t)
    # Check w is unchanged since transform_y is true
    np.testing.assert_allclose(w, w_t)

    # Check minimum and maximum values of transformed y are 0 and 1
    np.testing.assert_allclose(y_t.min(), 0.)
    np.testing.assert_allclose(y_t.max(), 1.)

    # Test if dimensionality expansion is handled correctly by untransform
    y_t = np.expand_dims(y_t, axis=-1)
    y_restored = minmax_transformer.untransform(y_t)
    assert y_restored.shape == y.shape + (1,)
    np.testing.assert_allclose(np.squeeze(y_restored, axis=-1), y)

  def test_X_minmax_transformer(self):
    solubility_dataset = load_solubility_data()
    minmax_transformer = dc.trans.MinMaxTransformer(
        transform_X=True, dataset=solubility_dataset)
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = minmax_transformer.transform(solubility_dataset)
    X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                            solubility_dataset.w, solubility_dataset.ids)

    # Check ids are unchanged before and after transformation
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt

    # Check X is unchanged since transform_y is true
    np.testing.assert_allclose(y, y_t)
    # Check w is unchanged since transform_y is true
    np.testing.assert_allclose(w, w_t)

    # Check minimum and maximum values of transformed y are 0 and 1
    np.testing.assert_allclose(X_t.min(), 0.)
    np.testing.assert_allclose(X_t.max(), 1.)

    # Check untransform works correctly
    np.testing.assert_allclose(minmax_transformer.untransform(X_t), X)

  def test_y_normalization_transformer(self):
    """Tests normalization transformer."""
    solubility_dataset = load_solubility_data()
@@ -413,7 +326,9 @@ class TestTransformers(unittest.TestCase):
    np.testing.assert_allclose(sorted, target)

    # Check that untransform does the right thing.
    np.testing.assert_allclose(cdf_transformer.untransform(y_t), y)
    y_restored = cdf_transformer.untransform(y_t)
    assert np.max(y_restored - y) < 1e-5
    #np.testing.assert_allclose(y_restored, y)

  def test_clipping_X_transformer(self):
    """Test clipping transformer on X of singletask dataset."""
+135 −28
Original line number Diff line number Diff line
@@ -857,14 +857,18 @@ class BalancingTransformer(Transformer):
      task_y = task_y[task_w != 0]
      N_task = len(task_y)
      class_counts = []
      # Note that by definition of classes, num_c >= 1 for all classes
      # Note that we may 0 elements of a given class since we remove those
      # labels with zero weight. This typically happens in multitask datasets
      # where some datapoints only have labels for some tasks.
      for c in self.classes:
        # this works because task_y is 1D
        num_c = len(np.where(task_y == c)[0])
        class_counts.append(num_c)
      # This is the right ratio since N_task/num_c * num_c = N_task
      # for all classes
      class_weights = [N_task / float(num_c) for num_c in class_counts]
      class_weights = [
          N_task / float(num_c) if num_c > 0 else 0 for num_c in class_counts
      ]
      weights.append(class_weights)
    self.weights = weights

@@ -914,11 +918,35 @@ class BalancingTransformer(Transformer):


class CDFTransformer(Transformer):
  """Histograms the data and assigns values based on sorted list."""
  """Acts like a Cumulative Distribution Function (CDF)."""
  """Histograms the data and assigns values based on sorted list.
  
  def __init__(self, transform_X=False, transform_y=False, dataset=None,
  Acts like a Cumulative Distribution Function (CDF). If given a dataset of
  samples from a continuous distribution computes the CDF of this dataset.

  TODO: Add an example of this. The current documentation is confusing.
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               transform_w=False,
               dataset=None,
               bins=2):
    """Initialize this transformer.

    Parameters
    ----------
    transform_X: bool, optional (default False)
      Whether to transform X
    transform_y: bool, optional (default False)
      Whether to transform y
    transform_w: bool, optional (default False)
      Whether to transform w
    dataset: dc.data.Dataset object, optional (default None)
      Dataset to be transformed
    bins: int, optional (default 2)

    """
    self.transform_X = transform_X
    self.transform_y = transform_y
    self.bins = bins
@@ -927,28 +955,63 @@ class CDFTransformer(Transformer):

  # TODO (flee2): for transform_y, figure out weights

  def transform(self, dataset, bins):
    """Performs CDF transform on data."""
    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  def transform_array(self, X, y, w):
    """Performs CDF transform on data.

    Parameters
    ----------
    X: np.ndarray
      Array of features
    y: np.ndarray
      Array of labels
    w: np.ndarray
      Array of weights.

    Returns
    -------
    Xtrans: np.ndarray
      Transformed array of features
    ytrans: np.ndarray
      Transformed array of labels
    wtrans: np.ndarray
      Transformed array of weights
    """
    w_t = w
    ids_t = ids
    if self.transform_X:
      X_t = get_cdf_values(X, self.bins)
      y_t = y
    if self.transform_y:
    elif self.transform_y:
      X_t = X
      y_t = get_cdf_values(y, self.bins)
      # print("y will not be transformed by CDFTransformer, for now.")
    return NumpyDataset(X_t, y_t, w_t, ids_t)
    return X_t, y_t, w_t

  def untransform(self, z):
    # print("Cannot undo CDF Transformer, for now.")
    """Undo transformation on provided data.

    Note that this transformation is only undone for y.

    Parameters
    ----------
    z: np.ndarray,
      Transformed y array
    """
    # Need this for transform_y
    if self.transform_y:
      return self.y
    else:
      raise NotImplementedError


def get_cdf_values(array, bins):
  """Helper function to compute CDF values.

  Parameters
  ----------
  array: np.ndarray
    Must be of shape `(n_rows, n_cols)`
  bins: int
    Number of bins to split data into.
  """
  # array = np.transpose(array)
  n_rows = array.shape[0]
  n_cols = array.shape[1]
@@ -970,18 +1033,63 @@ def get_cdf_values(array, bins):


class PowerTransformer(Transformer):
  """Takes power n transforms of the data based on an input vector."""
  """Takes power n transforms of the data based on an input vector.

  def __init__(self, transform_X=False, transform_y=False, powers=[1]):
  Computes the specified powers of the dataset. This can be useful if you're
  looking to add higher order features of the form `x_i^2`, `x_i^3` etc. to
  your dataset.
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               transform_w=False,
               dataset=None,
               powers=[1]):
    """Initialize this transformer

    Parameters
    ----------
    transform_X: bool, optional (default False)
      Whether to transform X
    transform_y: bool, optional (default False)
      Whether to transform y
    transform_w: bool, optional (default False)
      Whether to transform w
    dataset: dc.data.Dataset object, optional (default None)
      Dataset to be transformed. Note that this argument is ignored since
      `PowerTransformer` doesn't require it to be specified.
    powers: list[int], optional (default `[1]`)
      The list of powers of features/labels to compute.
    """
    if transform_w:
      raise ValueError("PowerTransformer doesn't support w transformation.")
    self.transform_X = transform_X
    self.transform_y = transform_y
    self.powers = powers

  def transform(self, dataset):
    """Performs power transform on data."""
    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  def transform_array(self, X, y, w):
    """Performs power transform on data.

    Parameters
    ----------
    X: np.ndarray
      Array of features
    y: np.ndarray
      Array of labels
    w: np.ndarray
      Array of weights.

    Returns
    -------
    Xtrans: np.ndarray
      Transformed array of features
    ytrans: np.ndarray
      Transformed array of labels
    wtrans: np.ndarray
      Transformed array of weights
    """
    w_t = w
    ids_t = ids
    n_powers = len(self.powers)
    if self.transform_X:
      X_t = np.power(X, self.powers[0])
@@ -989,21 +1097,20 @@ class PowerTransformer(Transformer):
        X_t = np.hstack((X_t, np.power(X, self.powers[i])))
      y_t = y
    if self.transform_y:
      # print("y will not be transformed by PowerTransformer, for now.")
      y_t = np.power(y, self.powers[0])
      for i in range(1, n_powers):
        y_t = np.hstack((y_t, np.power(y, self.powers[i])))
      X_t = X
    """
    shutil.rmtree(dataset.data_dir)
    os.makedirs(dataset.data_dir)
    DiskDataset.from_numpy(dataset.data_dir, X_t, y_t, w_t, ids_t)
    return dataset
    """
    return NumpyDataset(X_t, y_t, w_t, ids_t)
    return (X_t, y_t, w_t)

  def untransform(self, z):
    # print("Cannot undo Power Transformer, for now.")
    """Undo transformation on provided data.

    Parameters
    ----------
    z: np.ndarray,
      Transformed y array
    """
    n_powers = len(self.powers)
    orig_len = (z.shape[1]) // n_powers
    z = z[:, :orig_len]