Unverified Commit 59ddc9e6 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2044 from deepchem/duplicate_balancing

Duplicate Balancing Transformer
parents f34c9ce3 1c9474dd
Loading
Loading
Loading
Loading
+9 −7
Original line number Diff line number Diff line
@@ -831,8 +831,9 @@ class NumpyDataset(Dataset):
    -------
    a newly constructed Dataset object
    """
    newx, newy, neww = transformer.transform_array(self._X, self._y, self._w)
    return NumpyDataset(newx, newy, neww, self._ids[:])
    newx, newy, neww, newids = transformer.transform_array(
        self._X, self._y, self._w, self._ids)
    return NumpyDataset(newx, newy, neww, newids)

  def select(self, indices: Sequence[int],
             select_dir: str = None) -> "NumpyDataset":
@@ -1402,8 +1403,8 @@ class DiskDataset(Dataset):
        for shard_num, row in self.metadata_df.iterrows():
          logger.info("Transforming shard %d/%d" % (shard_num, n_shards))
          X, y, w, ids = self.get_shard(shard_num)
          newx, newy, neww = transformer.transform_array(X, y, w)
          yield (newx, newy, neww, ids)
          newx, newy, neww, newids = transformer.transform_array(X, y, w, ids)
          yield (newx, newy, neww, newids)

      dataset = DiskDataset.create_dataset(
          generator(), data_dir=out_dir, tasks=tasks)
@@ -1420,7 +1421,7 @@ class DiskDataset(Dataset):
    y = None if y_file is None else np.array(load_from_disk(y_file))
    w = None if w_file is None else np.array(load_from_disk(w_file))
    ids = np.array(load_from_disk(ids_file))
    X, y, w = transformer.transform_array(X, y, w)
    X, y, w, ids = transformer.transform_array(X, y, w, ids)
    basename = "shard-%d" % shard_num
    return DiskDataset.write_data_to_disk(out_dir, basename, tasks, X, y, w,
                                          ids)
@@ -2150,8 +2151,9 @@ class ImageDataset(Dataset):
    -------
    a newly constructed Dataset object
    """
    newx, newy, neww = transformer.transform_array(self.X, self.y, self.w)
    return NumpyDataset(newx, newy, neww, self.ids[:])
    newx, newy, neww, newids = transformer.transform_array(
        self.X, self.y, self.w, self.ids)
    return NumpyDataset(newx, newy, neww, newids)

  def select(self, indices: Sequence[int],
             select_dir: str = None) -> "ImageDataset":
+717 −690
Original line number Diff line number Diff line
@@ -55,16 +55,34 @@ def load_multitask_data():

class TestTransformer(dc.trans.Transformer):

  def transform_array(self, X, y, w):
    return (2 * X, 1.5 * y, w)
  def transform_array(self, X, y, w, ids):
    return (2 * X, 1.5 * y, w, ids)


class TestDatasets(test_util.TensorFlowTestCase):
  """
  Test basic top-level API for dataset objects.
  """
def test_transform_disk():
  """Test that the transform() method works for DiskDatasets."""
  dataset = load_solubility_data()
  X = dataset.X
  y = dataset.y
  w = dataset.w
  ids = dataset.ids

  # Transform it

  def test_sparsify_and_densify(self):
  transformer = TestTransformer(transform_X=True, transform_y=True)
  for parallel in (True, False):
    transformed = dataset.transform(transformer, parallel=parallel)
    np.testing.assert_array_equal(X, dataset.X)
    np.testing.assert_array_equal(y, dataset.y)
    np.testing.assert_array_equal(w, dataset.w)
    np.testing.assert_array_equal(ids, dataset.ids)
    np.testing.assert_array_equal(2 * X, transformed.X)
    np.testing.assert_array_equal(1.5 * y, transformed.y)
    np.testing.assert_array_equal(w, transformed.w)
    np.testing.assert_array_equal(ids, transformed.ids)


def test_sparsify_and_densify():
  """Test that sparsify and densify work as inverses."""
  # Test on identity matrix
  num_samples = 10
@@ -88,7 +106,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  X_reconstructed = dc.data.densify_features(X_sparse, num_features)
  np.testing.assert_array_equal(X, X_reconstructed)

  def test_pad_features(self):

def test_pad_features():
  """Test that pad_features pads features correctly."""
  batch_size = 100
  num_features = 10
@@ -133,7 +152,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  X_out = dc.data.pad_features(batch_size, X_b)
  assert len(X_out) == batch_size

  def test_pad_batches(self):

def test_pad_batches():
  """Test that pad_batch pads batches correctly."""
  batch_size = 100
  num_features = 10
@@ -205,7 +225,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
                                                   ids_b)
  assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

  def test_get_task_names(self):

def test_get_task_names():
  """Test that get_task_names returns correct task_names"""
  solubility_dataset = load_solubility_data()
  assert solubility_dataset.get_task_names() == ["log-solubility"]
@@ -217,7 +238,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
      "task15", "task16"
  ])

  def test_get_data_shape(self):

def test_get_data_shape():
  """Test that get_data_shape returns currect data shape"""
  solubility_dataset = load_solubility_data()
  assert solubility_dataset.get_data_shape() == (1024,)
@@ -225,12 +247,14 @@ class TestDatasets(test_util.TensorFlowTestCase):
  multitask_dataset = load_multitask_data()
  assert multitask_dataset.get_data_shape() == (1024,)

  def test_len(self):

def test_len():
  """Test that len(dataset) works."""
  solubility_dataset = load_solubility_data()
  assert len(solubility_dataset) == 10

  def test_reshard(self):

def test_reshard():
  """Test that resharding the dataset works."""
  solubility_dataset = load_solubility_data()
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
@@ -258,7 +282,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  np.testing.assert_array_equal(w, w_rr)
  np.testing.assert_array_equal(ids, ids_rr)

  def test_select(self):

def test_select():
  """Test that dataset select works."""
  num_datapoints = 10
  num_features = 10
@@ -278,7 +303,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  np.testing.assert_array_equal(w[indices], w_sel)
  np.testing.assert_array_equal(ids[indices], ids_sel)

  def test_complete_shuffle(self):

def test_complete_shuffle():
  shard_sizes = [1, 2, 3, 4, 5]
  batch_size = 10

@@ -316,7 +342,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
      np.sort(dataset.w, axis=0), np.sort(res.w, axis=0))
  np.testing.assert_array_equal(np.sort(dataset.ids), np.sort(res.ids))

  def test_get_shape(self):

def test_get_shape():
  """Test that get_shape works."""
  num_datapoints = 100
  num_features = 10
@@ -335,7 +362,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  assert w_shape == w.shape
  assert ids_shape == ids.shape

  def test_iterbatches(self):

def test_iterbatches():
  """Test that iterating over batches of data works."""
  solubility_dataset = load_solubility_data()
  batch_size = 2
@@ -347,7 +375,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
    assert w_b.shape == (batch_size,) + (len(tasks),)
    assert ids_b.shape == (batch_size,)

  def test_itersamples_numpy(self):

def test_itersamples_numpy():
  """Test that iterating over samples in a NumpyDataset works."""
  num_datapoints = 100
  num_features = 10
@@ -364,7 +393,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_array_equal(sw, w[i])
    np.testing.assert_array_equal(sid, ids[i])

  def test_itersamples_disk(self):

def test_itersamples_disk():
  """Test that iterating over samples in a DiskDataset works."""
  solubility_dataset = load_solubility_data()
  X = solubility_dataset.X
@@ -377,7 +407,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_array_equal(sw, w[i])
    np.testing.assert_array_equal(sid, ids[i])

  def test_transform_numpy(self):

def test_transform_numpy():
  """Test that the transform() method works for NumpyDatasets."""
  num_datapoints = 100
  num_features = 10
@@ -403,29 +434,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  np.testing.assert_array_equal(w, transformed.w)
  np.testing.assert_array_equal(ids, transformed.ids)

  def test_transform_disk(self):
    """Test that the transform() method works for DiskDatasets."""
    dataset = load_solubility_data()
    X = dataset.X
    y = dataset.y
    w = dataset.w
    ids = dataset.ids

    # Transform it

    transformer = TestTransformer(transform_X=True, transform_y=True)
    for parallel in (True, False):
      transformed = dataset.transform(transformer, parallel=parallel)
      np.testing.assert_array_equal(X, dataset.X)
      np.testing.assert_array_equal(y, dataset.y)
      np.testing.assert_array_equal(w, dataset.w)
      np.testing.assert_array_equal(ids, dataset.ids)
      np.testing.assert_array_equal(2 * X, transformed.X)
      np.testing.assert_array_equal(1.5 * y, transformed.y)
      np.testing.assert_array_equal(w, transformed.w)
      np.testing.assert_array_equal(ids, transformed.ids)

  def test_to_numpy(self):
def test_to_numpy():
  """Test that transformation to numpy arrays is sensible."""
  solubility_dataset = load_solubility_data()
  data_shape = solubility_dataset.get_data_shape()
@@ -440,7 +450,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  assert w.shape == (N_samples, N_tasks)
  assert ids.shape == (N_samples,)

  def test_consistent_ordering(self):

def test_consistent_ordering():
  """Test that ordering of labels is consistent over time."""
  solubility_dataset = load_solubility_data()

@@ -449,7 +460,8 @@ class TestDatasets(test_util.TensorFlowTestCase):

  assert np.array_equal(ids1, ids2)

  def test_get_statistics(self):

def test_get_statistics():
  """Test statistics computation of this dataset."""
  solubility_dataset = load_solubility_data()
  X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
@@ -463,7 +475,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  np.testing.assert_allclose(comp_X_stds, X_stds)
  np.testing.assert_allclose(comp_y_stds, y_stds)

  def test_disk_iterate_batch_size(self):

def test_disk_iterate_batch_size():
  solubility_dataset = load_solubility_data()
  X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                solubility_dataset.w, solubility_dataset.ids)
@@ -471,9 +484,10 @@ class TestDatasets(test_util.TensorFlowTestCase):
  for X, y, _, _ in solubility_dataset.iterbatches(
      3, epochs=2, pad_batches=False, deterministic=True):
    batch_sizes.append(len(X))
    self.assertEqual([3, 3, 3, 1, 3, 3, 3, 1], batch_sizes)
  assert [3, 3, 3, 1, 3, 3, 3, 1] == batch_sizes

  def test_disk_pad_batches(self):

def test_disk_pad_batches():
  shard_sizes = [21, 11, 41, 21, 51]
  batch_size = 10

@@ -531,7 +545,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  np.testing.assert_array_equal(all_ws, test_ws[:total_size, :])
  np.testing.assert_array_equal(all_ids, test_ids[:total_size])

  def test_disk_iterate_y_w_None(self):

def test_disk_iterate_y_w_None():
  shard_sizes = [21, 11, 41, 21, 51]
  batch_size = 10

@@ -575,7 +590,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  np.testing.assert_array_equal(all_Xs, test_Xs[:total_size, :])
  np.testing.assert_array_equal(all_ids, test_ids[:total_size])

  def test_disk_iterate_batch(self):

def test_disk_iterate_batch():

  all_batch_sizes = [None, 32, 17, 11]
  all_shard_sizes = [[7, 3, 12, 4, 5], [1, 1, 1, 1, 1], [31, 31, 31, 31, 31],
@@ -688,19 +704,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_array_equal(
        np.sort(all_ids, axis=0), np.sort(test_ids, axis=0))

  def test_numpy_iterate_batch_size(self):
    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
        solubility_dataset)
    batch_sizes = []
    for X, y, _, _ in solubility_dataset.iterbatches(
        3, epochs=2, pad_batches=False, deterministic=True):
      batch_sizes.append(len(X))
    self.assertEqual([3, 3, 3, 1, 3, 3, 3, 1], batch_sizes)

  def test_merge(self):
def test_merge():
  """Test that dataset merge works."""
  num_datapoints = 10
  num_features = 10
@@ -722,7 +727,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  assert new_data.y.shape == (num_datapoints * num_datasets, num_tasks)
  assert len(new_data.tasks) == len(datasets[0].tasks)

  def test_make_tf_dataset(self):

def test_make_tf_dataset():
  """Test creating a Tensorflow Iterator from a Dataset."""
  X = np.random.random((100, 5))
  y = np.random.random((100, 1))
@@ -736,7 +742,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
  assert i == 19

  def _validate_pytorch_dataset(self, dataset):

def _validate_pytorch_dataset(dataset):
  X = dataset.X
  y = dataset.y
  w = dataset.w
@@ -780,32 +787,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
    id_count[iter_id[0]] += 1
  assert all(id_count[id] == 2 for id in ids)

  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_numpy(self):
    """Test creating a PyTorch Dataset from a NumpyDataset."""
    X = np.random.random((100, 5))
    y = np.random.random((100, 1))
    ids = [str(i) for i in range(100)]
    dataset = dc.data.NumpyDataset(X, y, ids=ids)
    self._validate_pytorch_dataset(dataset)

  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_images(self):
    """Test creating a PyTorch Dataset from an ImageDataset."""
    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]
    y = np.random.random((10, 1))
    ids = [str(i) for i in range(len(files))]
    dataset = dc.data.ImageDataset(files, y, ids=ids)
    self._validate_pytorch_dataset(dataset)

  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_disk(self):
    """Test creating a PyTorch Dataset from a DiskDataset."""
    dataset = load_solubility_data()
    self._validate_pytorch_dataset(dataset)

  def test_dataframe(self):
def test_dataframe():
  """Test converting between Datasets and DataFrames."""
  dataset = load_solubility_data()

@@ -827,7 +810,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
  np.testing.assert_array_equal(
      np.stack([dataset.y[:, 0], dataset.X[:, 0]], axis=1), dataset3.w)

  def test_to_str(self):

def test_to_str():
  """Tests to string representation of Dataset."""
  dataset = dc.data.NumpyDataset(
      X=np.random.rand(5, 3), y=np.random.rand(5,), ids=np.arange(5))
@@ -853,3 +837,46 @@ class TestDatasets(test_util.TensorFlowTestCase):
      X=np.random.rand(50, 3), y=np.random.rand(50,), ids=np.arange(50))
  ref_str = '<NumpyDataset X.shape: (50, 3), y.shape: (50,), w.shape: (50,), task_names: [0]>'
  assert str(dataset) == ref_str


class TestDatasets(test_util.TensorFlowTestCase):
  """
  Dataset tests that remain on the TensorFlowTestCase-based class.

  Covers batch iteration over a NumpyDataset and the PyTorch dataset
  checks, which are skipped when PyTorch could not be imported.
  """

  def test_numpy_iterate_batch_size(self):
    """Check the batch sizes produced when iterating a NumpyDataset."""
    disk_dataset = load_solubility_data()
    # Read all four arrays from the disk-backed dataset before converting.
    _ = (disk_dataset.X, disk_dataset.y, disk_dataset.w, disk_dataset.ids)
    numpy_dataset = dc.data.NumpyDataset.from_DiskDataset(disk_dataset)
    batches = numpy_dataset.iterbatches(
        3, epochs=2, pad_batches=False, deterministic=True)
    observed_sizes = [len(X_b) for X_b, _, _, _ in batches]
    self.assertEqual([3, 3, 3, 1, 3, 3, 3, 1], observed_sizes)

  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_numpy(self):
    """Validate the PyTorch dataset built from an in-memory NumpyDataset."""
    n_samples = 100
    features = np.random.random((n_samples, 5))
    labels = np.random.random((n_samples, 1))
    sample_ids = list(map(str, range(n_samples)))
    _validate_pytorch_dataset(
        dc.data.NumpyDataset(features, labels, ids=sample_ids))

  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_images(self):
    """Validate the PyTorch dataset built from an ImageDataset."""
    image_dir = os.path.join(os.path.dirname(__file__), 'images')
    image_files = [
        os.path.join(image_dir, name) for name in os.listdir(image_dir)
    ]
    labels = np.random.random((10, 1))
    sample_ids = list(map(str, range(len(image_files))))
    _validate_pytorch_dataset(
        dc.data.ImageDataset(image_files, labels, ids=sample_ids))

  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_disk(self):
    """Validate the PyTorch dataset built from a DiskDataset."""
    _validate_pytorch_dataset(load_solubility_data())
+2 −2
Original line number Diff line number Diff line
@@ -395,7 +395,7 @@ class MultitaskFitTransformRegressor(MultitaskRegressor):
    for transformer in fit_transformers:
      assert transformer.transform_X and not (transformer.transform_y or
                                              transformer.transform_w)
      X_b, _, _ = transformer.transform_array(X_b, None, None)
      X_b, _, _, _ = transformer.transform_array(X_b, None, None, None)
    n_features = X_b.shape[1]
    logger.info("n_features after fit_transform: %d", int(n_features))
    super(MultitaskFitTransformRegressor, self).__init__(
@@ -418,7 +418,7 @@ class MultitaskFitTransformRegressor(MultitaskRegressor):
        if X_b is not None:
          if mode == 'fit':
            for transformer in self.fit_transformers:
              X_b, _, _ = transformer.transform_array(X_b, None, None)
              X_b, _, _, _ = transformer.transform_array(X_b, None, None, None)
        if mode == 'predict':
          dropout = np.array(0.0)
        else:
+1 −0
Original line number Diff line number Diff line
@@ -19,3 +19,4 @@ from deepchem.trans.transformers import FeaturizationTransformer
from deepchem.trans.transformers import ImageTransformer
from deepchem.trans.transformers import DataTransforms
from deepchem.trans.transformers import Transformer
from deepchem.trans.duplicate import DuplicateBalancingTransformer
+173 −0
Original line number Diff line number Diff line
import logging
import numpy as np
from typing import Tuple
from deepchem.data import Dataset
from deepchem.trans.transformers import Transformer

logger = logging.getLogger(__name__)


class DuplicateBalancingTransformer(Transformer):
  """Balance binary or multiclass datasets by duplicating rarer class samples.

  This class balances a dataset by duplicating samples of the rarer class so
  that the sum of all example weights from all classes is the same. (Up to
  integer rounding of course). This can be useful when you're working on an
  imbalanced dataset where there are far fewer examples of some classes than
  others.

  This class differs from `BalancingTransformer` in that it actually
  duplicates rarer class samples rather than just increasing their sample
  weights. This may be more friendly for models that are numerically fragile
  and can't handle imbalanced example weights.

  Examples
  --------
  Here's an example for a binary dataset.

  >>> n_samples = 10
  >>> n_features = 3
  >>> n_tasks = 1
  >>> n_classes = 2
  >>> import deepchem as dc
  >>> import numpy as np
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features)
  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> transformer = dc.trans.DuplicateBalancingTransformer(dataset=dataset)
  >>> dataset = transformer.transform(dataset)

  And here's a multiclass dataset example.

  >>> n_samples = 50
  >>> n_features = 3
  >>> n_tasks = 1
  >>> n_classes = 5
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features)
  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> transformer = dc.trans.DuplicateBalancingTransformer(dataset=dataset)
  >>> dataset = transformer.transform(dataset)

  See Also
  --------
  deepchem.trans.BalancingTransformer: Balance by changing sample weights.

  Note
  ----
  This transformer is only well-defined for singletask datasets. (Since
  examples are actually duplicated, there's no meaningful way to duplicate
  across multiple tasks in a way that preserves the balance.)

  This transformer is only meaningful for classification datasets where `y`
  takes on a limited set of values. This class transforms all of `X`, `y`,
  `w`, `ids`.

  Raises
  ------
  `ValueError` if the provided dataset is multitask.
  """

  def __init__(self, dataset: Dataset):
    """Compute the per-class duplication ratios from `dataset`.

    Parameters
    ----------
    dataset: Dataset
      Singletask dataset whose labels `y` and weights `w` are used to work
      out how many copies of each class's samples are needed.

    Raises
    ------
    ValueError
      If `dataset` has more than one task, or if `y` or `w` is neither 1D
      nor 2D.
    """
    super(DuplicateBalancingTransformer, self).__init__(
        transform_X=True,
        transform_y=True,
        transform_w=True,
        transform_ids=True,
        dataset=dataset)

    if len(dataset.get_task_names()) > 1:
      raise ValueError(
          "This transformation is only defined for singletask datsets.")

    # Get the labels/weights
    y = dataset.y
    w = dataset.w
    # Normalize shapes
    if len(y.shape) == 1:
      y = np.reshape(y, (len(y), 1))
    if len(w.shape) == 1:
      w = np.reshape(w, (len(w), 1))
    if len(y.shape) != 2:
      raise ValueError("y must be of shape (N,) or (N, n_tasks)")
    if len(w.shape) != 2:
      raise ValueError("w must be of shape (N,) or (N, n_tasks)")
    # The class list is taken before zero-weight samples are dropped, so a
    # class may end up with no weighted samples at all (ratio 0 below).
    self.classes = sorted(np.unique(y))
    # Remove labels with zero weights. The same mask must be applied to both
    # arrays: filtering only `y` leaves `w` with its original length, and the
    # per-class mask `y == c` below would then no longer line up with `w`.
    nonzero = (w != 0)
    y = y[nonzero]
    w = w[nonzero]
    # Boolean-mask indexing flattened y and w to matching 1D arrays, so the
    # per-class mask can index w directly.
    class_weights = [np.sum(w[y == c]) for c in self.classes]
    weight_largest = max(class_weights)
    # This is the right ratio since int(N/num_c) * num_c \approx N
    # for all classes. Classes with no weight get ratio 0.
    self.duplication_ratio = [
        int(weight_largest / float(c_weight)) if c_weight > 0 else 0
        for c_weight in class_weights
    ]

  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Transform the data in a set of (X, y, w, id) arrays.

    Every sample whose label equals one of the classes seen at construction
    time is repeated according to that class's duplication ratio. The output
    is grouped by class, in the order of `self.classes`. Samples of a class
    with ratio 0 do not appear in the output.

    Parameters
    ----------
    X: np.ndarray
      Array of features
    y: np.ndarray
      Array of labels, of shape `(N,)` or `(N, 1)`
    w: np.ndarray
      Array of weights, of shape `(N,)` or `(N, 1)`
    ids: np.ndarray
      Array of identifiers

    Returns
    -------
    Xtrans: np.ndarray
      Transformed array of features
    ytrans: np.ndarray
      Transformed array of labels, flattened to 1D
    wtrans: np.ndarray
      Transformed array of weights, flattened to 1D
    idstrans: np.ndarray
      Transformed array of identifiers

    Raises
    ------
    ValueError
      If `y` or `w` is not of shape `(N,)` or `(N, 1)`.
    """
    if not (len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)):
      raise ValueError("y must be of shape (N,) or (N, 1)")
    if not (len(w.shape) == 1 or (len(w.shape) == 2 and w.shape[1] == 1)):
      raise ValueError("w must be of shape (N,) or (N, 1)")
    # Flattening is safe because of shape check above
    y = y.flatten()
    w = w.flatten()
    X_dups, y_dups, w_dups, ids_dups = [], [], [], []
    for c, ratio in zip(self.classes, self.duplication_ratio):
      # Select this class's rows and repeat each one `ratio` times.
      c_inds = (y == c)
      X_dups.append(np.repeat(X[c_inds], ratio, axis=0))
      y_dups.append(np.repeat(y[c_inds], ratio, axis=0))
      w_dups.append(np.repeat(w[c_inds], ratio, axis=0))
      ids_dups.append(np.repeat(ids[c_inds], ratio, axis=0))
    Xtrans = np.concatenate(X_dups, axis=0)
    ytrans = np.concatenate(y_dups, axis=0)
    wtrans = np.concatenate(w_dups, axis=0)
    idstrans = np.concatenate(ids_dups, axis=0)
    return (Xtrans, ytrans, wtrans, idstrans)
Loading