Merge pull request #2044 from deepchem/duplicate_balancing (59ddc9e6) · Commits · 钟慕尧 / deepchem

deepchem/data/datasets.py

+9 −7

Original line number	Diff line number	Diff line
		@@ -831,8 +831,9 @@ class NumpyDataset(Dataset):
		-------
		a newly constructed Dataset object
		"""
		newx, newy, neww = transformer.transform_array(self._X, self._y, self._w)
		return NumpyDataset(newx, newy, neww, self._ids[:])
		newx, newy, neww, newids = transformer.transform_array(
		self._X, self._y, self._w, self._ids)
		return NumpyDataset(newx, newy, neww, newids)

		def select(self, indices: Sequence[int],
		select_dir: str = None) -> "NumpyDataset":
		@@ -1402,8 +1403,8 @@ class DiskDataset(Dataset):
		for shard_num, row in self.metadata_df.iterrows():
		logger.info("Transforming shard %d/%d" % (shard_num, n_shards))
		X, y, w, ids = self.get_shard(shard_num)
		newx, newy, neww = transformer.transform_array(X, y, w)
		yield (newx, newy, neww, ids)
		newx, newy, neww, newids = transformer.transform_array(X, y, w, ids)
		yield (newx, newy, neww, newids)

		dataset = DiskDataset.create_dataset(
		generator(), data_dir=out_dir, tasks=tasks)
		@@ -1420,7 +1421,7 @@ class DiskDataset(Dataset):
		y = None if y_file is None else np.array(load_from_disk(y_file))
		w = None if w_file is None else np.array(load_from_disk(w_file))
		ids = np.array(load_from_disk(ids_file))
		X, y, w = transformer.transform_array(X, y, w)
		X, y, w, ids = transformer.transform_array(X, y, w, ids)
		basename = "shard-%d" % shard_num
		return DiskDataset.write_data_to_disk(out_dir, basename, tasks, X, y, w,
		ids)
		@@ -2150,8 +2151,9 @@ class ImageDataset(Dataset):
		-------
		a newly constructed Dataset object
		"""
		newx, newy, neww = transformer.transform_array(self.X, self.y, self.w)
		return NumpyDataset(newx, newy, neww, self.ids[:])
		newx, newy, neww, newids = transformer.transform_array(
		self.X, self.y, self.w, self.ids)
		return NumpyDataset(newx, newy, neww, newids)

		def select(self, indices: Sequence[int],
		select_dir: str = None) -> "ImageDataset":

deepchem/data/tests/test_datasets.py

+717 −690

Original line number	Diff line number	Diff line
		@@ -55,16 +55,34 @@ def load_multitask_data():

		class TestTransformer(dc.trans.Transformer):

		def transform_array(self, X, y, w):
		return (2 * X, 1.5 * y, w)
		def transform_array(self, X, y, w, ids):
		return (2 * X, 1.5 * y, w, ids)


		class TestDatasets(test_util.TensorFlowTestCase):
		"""
		Test basic top-level API for dataset objects.
		"""
		def test_transform_disk():
		"""Test that the transform() method works for DiskDatasets."""
		dataset = load_solubility_data()
		X = dataset.X
		y = dataset.y
		w = dataset.w
		ids = dataset.ids

		# Transform it

		def test_sparsify_and_densify(self):
		transformer = TestTransformer(transform_X=True, transform_y=True)
		for parallel in (True, False):
		transformed = dataset.transform(transformer, parallel=parallel)
		np.testing.assert_array_equal(X, dataset.X)
		np.testing.assert_array_equal(y, dataset.y)
		np.testing.assert_array_equal(w, dataset.w)
		np.testing.assert_array_equal(ids, dataset.ids)
		np.testing.assert_array_equal(2 * X, transformed.X)
		np.testing.assert_array_equal(1.5 * y, transformed.y)
		np.testing.assert_array_equal(w, transformed.w)
		np.testing.assert_array_equal(ids, transformed.ids)


		def test_sparsify_and_densify():
		"""Test that sparsify and densify work as inverses."""
		# Test on identity matrix
		num_samples = 10
		@@ -88,7 +106,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		X_reconstructed = dc.data.densify_features(X_sparse, num_features)
		np.testing.assert_array_equal(X, X_reconstructed)

		def test_pad_features(self):

		def test_pad_features():
		"""Test that pad_features pads features correctly."""
		batch_size = 100
		num_features = 10
		@@ -133,7 +152,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		X_out = dc.data.pad_features(batch_size, X_b)
		assert len(X_out) == batch_size

		def test_pad_batches(self):

		def test_pad_batches():
		"""Test that pad_batch pads batches correctly."""
		batch_size = 100
		num_features = 10
		@@ -205,7 +225,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		ids_b)
		assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

		def test_get_task_names(self):

		def test_get_task_names():
		"""Test that get_task_names returns correct task_names"""
		solubility_dataset = load_solubility_data()
		assert solubility_dataset.get_task_names() == ["log-solubility"]
		@@ -217,7 +238,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		"task15", "task16"
		])

		def test_get_data_shape(self):

		def test_get_data_shape():
		"""Test that get_data_shape returns currect data shape"""
		solubility_dataset = load_solubility_data()
		assert solubility_dataset.get_data_shape() == (1024,)
		@@ -225,12 +247,14 @@ class TestDatasets(test_util.TensorFlowTestCase):
		multitask_dataset = load_multitask_data()
		assert multitask_dataset.get_data_shape() == (1024,)

		def test_len(self):

		def test_len():
		"""Test that len(dataset) works."""
		solubility_dataset = load_solubility_data()
		assert len(solubility_dataset) == 10

		def test_reshard(self):

		def test_reshard():
		"""Test that resharding the dataset works."""
		solubility_dataset = load_solubility_data()
		X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
		@@ -258,7 +282,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(w, w_rr)
		np.testing.assert_array_equal(ids, ids_rr)

		def test_select(self):

		def test_select():
		"""Test that dataset select works."""
		num_datapoints = 10
		num_features = 10
		@@ -278,7 +303,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(w[indices], w_sel)
		np.testing.assert_array_equal(ids[indices], ids_sel)

		def test_complete_shuffle(self):

		def test_complete_shuffle():
		shard_sizes = [1, 2, 3, 4, 5]
		batch_size = 10

		@@ -316,7 +342,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.sort(dataset.w, axis=0), np.sort(res.w, axis=0))
		np.testing.assert_array_equal(np.sort(dataset.ids), np.sort(res.ids))

		def test_get_shape(self):

		def test_get_shape():
		"""Test that get_shape works."""
		num_datapoints = 100
		num_features = 10
		@@ -335,7 +362,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		assert w_shape == w.shape
		assert ids_shape == ids.shape

		def test_iterbatches(self):

		def test_iterbatches():
		"""Test that iterating over batches of data works."""
		solubility_dataset = load_solubility_data()
		batch_size = 2
		@@ -347,7 +375,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		assert w_b.shape == (batch_size,) + (len(tasks),)
		assert ids_b.shape == (batch_size,)

		def test_itersamples_numpy(self):

		def test_itersamples_numpy():
		"""Test that iterating over samples in a NumpyDataset works."""
		num_datapoints = 100
		num_features = 10
		@@ -364,7 +393,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(sw, w[i])
		np.testing.assert_array_equal(sid, ids[i])

		def test_itersamples_disk(self):

		def test_itersamples_disk():
		"""Test that iterating over samples in a DiskDataset works."""
		solubility_dataset = load_solubility_data()
		X = solubility_dataset.X
		@@ -377,7 +407,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(sw, w[i])
		np.testing.assert_array_equal(sid, ids[i])

		def test_transform_numpy(self):

		def test_transform_numpy():
		"""Test that the transform() method works for NumpyDatasets."""
		num_datapoints = 100
		num_features = 10
		@@ -403,29 +434,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(w, transformed.w)
		np.testing.assert_array_equal(ids, transformed.ids)

		def test_transform_disk(self):
		"""Test that the transform() method works for DiskDatasets."""
		dataset = load_solubility_data()
		X = dataset.X
		y = dataset.y
		w = dataset.w
		ids = dataset.ids

		# Transform it

		transformer = TestTransformer(transform_X=True, transform_y=True)
		for parallel in (True, False):
		transformed = dataset.transform(transformer, parallel=parallel)
		np.testing.assert_array_equal(X, dataset.X)
		np.testing.assert_array_equal(y, dataset.y)
		np.testing.assert_array_equal(w, dataset.w)
		np.testing.assert_array_equal(ids, dataset.ids)
		np.testing.assert_array_equal(2 * X, transformed.X)
		np.testing.assert_array_equal(1.5 * y, transformed.y)
		np.testing.assert_array_equal(w, transformed.w)
		np.testing.assert_array_equal(ids, transformed.ids)

		def test_to_numpy(self):
		def test_to_numpy():
		"""Test that transformation to numpy arrays is sensible."""
		solubility_dataset = load_solubility_data()
		data_shape = solubility_dataset.get_data_shape()
		@@ -440,7 +450,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		assert w.shape == (N_samples, N_tasks)
		assert ids.shape == (N_samples,)

		def test_consistent_ordering(self):

		def test_consistent_ordering():
		"""Test that ordering of labels is consistent over time."""
		solubility_dataset = load_solubility_data()

		@@ -449,7 +460,8 @@ class TestDatasets(test_util.TensorFlowTestCase):

		assert np.array_equal(ids1, ids2)

		def test_get_statistics(self):

		def test_get_statistics():
		"""Test statistics computation of this dataset."""
		solubility_dataset = load_solubility_data()
		X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
		@@ -463,7 +475,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_allclose(comp_X_stds, X_stds)
		np.testing.assert_allclose(comp_y_stds, y_stds)

		def test_disk_iterate_batch_size(self):

		def test_disk_iterate_batch_size():
		solubility_dataset = load_solubility_data()
		X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
		solubility_dataset.w, solubility_dataset.ids)
		@@ -471,9 +484,10 @@ class TestDatasets(test_util.TensorFlowTestCase):
		for X, y, _, _ in solubility_dataset.iterbatches(
		3, epochs=2, pad_batches=False, deterministic=True):
		batch_sizes.append(len(X))
		self.assertEqual([3, 3, 3, 1, 3, 3, 3, 1], batch_sizes)
		assert [3, 3, 3, 1, 3, 3, 3, 1] == batch_sizes

		def test_disk_pad_batches(self):

		def test_disk_pad_batches():
		shard_sizes = [21, 11, 41, 21, 51]
		batch_size = 10

		@@ -531,7 +545,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(all_ws, test_ws[:total_size, :])
		np.testing.assert_array_equal(all_ids, test_ids[:total_size])

		def test_disk_iterate_y_w_None(self):

		def test_disk_iterate_y_w_None():
		shard_sizes = [21, 11, 41, 21, 51]
		batch_size = 10

		@@ -575,7 +590,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(all_Xs, test_Xs[:total_size, :])
		np.testing.assert_array_equal(all_ids, test_ids[:total_size])

		def test_disk_iterate_batch(self):

		def test_disk_iterate_batch():

		all_batch_sizes = [None, 32, 17, 11]
		all_shard_sizes = [[7, 3, 12, 4, 5], [1, 1, 1, 1, 1], [31, 31, 31, 31, 31],
		@@ -688,19 +704,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(
		np.sort(all_ids, axis=0), np.sort(test_ids, axis=0))

		def test_numpy_iterate_batch_size(self):
		solubility_dataset = load_solubility_data()
		X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
		solubility_dataset.w, solubility_dataset.ids)
		solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
		solubility_dataset)
		batch_sizes = []
		for X, y, _, _ in solubility_dataset.iterbatches(
		3, epochs=2, pad_batches=False, deterministic=True):
		batch_sizes.append(len(X))
		self.assertEqual([3, 3, 3, 1, 3, 3, 3, 1], batch_sizes)

		def test_merge(self):
		def test_merge():
		"""Test that dataset merge works."""
		num_datapoints = 10
		num_features = 10
		@@ -722,7 +727,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		assert new_data.y.shape == (num_datapoints * num_datasets, num_tasks)
		assert len(new_data.tasks) == len(datasets[0].tasks)

		def test_make_tf_dataset(self):

		def test_make_tf_dataset():
		"""Test creating a Tensorflow Iterator from a Dataset."""
		X = np.random.random((100, 5))
		y = np.random.random((100, 1))
		@@ -736,7 +742,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(np.ones((10, 1)), batch_w)
		assert i == 19

		def _validate_pytorch_dataset(self, dataset):

		def _validate_pytorch_dataset(dataset):
		X = dataset.X
		y = dataset.y
		w = dataset.w
		@@ -780,32 +787,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		id_count[iter_id[0]] += 1
		assert all(id_count[id] == 2 for id in ids)

		@unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
		def test_make_pytorch_dataset_from_numpy(self):
		"""Test creating a PyTorch Dataset from a NumpyDataset."""
		X = np.random.random((100, 5))
		y = np.random.random((100, 1))
		ids = [str(i) for i in range(100)]
		dataset = dc.data.NumpyDataset(X, y, ids=ids)
		self._validate_pytorch_dataset(dataset)

		@unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
		def test_make_pytorch_dataset_from_images(self):
		"""Test creating a PyTorch Dataset from an ImageDataset."""
		path = os.path.join(os.path.dirname(__file__), 'images')
		files = [os.path.join(path, f) for f in os.listdir(path)]
		y = np.random.random((10, 1))
		ids = [str(i) for i in range(len(files))]
		dataset = dc.data.ImageDataset(files, y, ids=ids)
		self._validate_pytorch_dataset(dataset)

		@unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
		def test_make_pytorch_dataset_from_disk(self):
		"""Test creating a PyTorch Dataset from a DiskDataset."""
		dataset = load_solubility_data()
		self._validate_pytorch_dataset(dataset)

		def test_dataframe(self):
		def test_dataframe():
		"""Test converting between Datasets and DataFrames."""
		dataset = load_solubility_data()

		@@ -827,7 +810,8 @@ class TestDatasets(test_util.TensorFlowTestCase):
		np.testing.assert_array_equal(
		np.stack([dataset.y[:, 0], dataset.X[:, 0]], axis=1), dataset3.w)

		def test_to_str(self):

		def test_to_str():
		"""Tests to string representation of Dataset."""
		dataset = dc.data.NumpyDataset(
		X=np.random.rand(5, 3), y=np.random.rand(5,), ids=np.arange(5))
		@@ -853,3 +837,46 @@ class TestDatasets(test_util.TensorFlowTestCase):
		X=np.random.rand(50, 3), y=np.random.rand(50,), ids=np.arange(50))
		ref_str = '<NumpyDataset X.shape: (50, 3), y.shape: (50,), w.shape: (50,), task_names: [0]>'
		assert str(dataset) == ref_str


		class TestDatasets(test_util.TensorFlowTestCase):
		  """
		  Test basic top-level API for dataset objects.
		  """

		  def test_numpy_iterate_batch_size(self):
		    """Test the batch sizes produced when iterating a NumpyDataset."""
		    solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
		        load_solubility_data())
		    batch_sizes = [
		        len(X_b) for X_b, _, _, _ in solubility_dataset.iterbatches(
		            3, epochs=2, pad_batches=False, deterministic=True)
		    ]
		    # With pad_batches=False the last batch of each of the two epochs is
		    # expected to be a short batch of size 1.
		    self.assertEqual([3, 3, 3, 1, 3, 3, 3, 1], batch_sizes)

		  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
		  def test_make_pytorch_dataset_from_numpy(self):
		    """Test creating a PyTorch Dataset from a NumpyDataset."""
		    X = np.random.random((100, 5))
		    y = np.random.random((100, 1))
		    ids = [str(i) for i in range(100)]
		    dataset = dc.data.NumpyDataset(X, y, ids=ids)
		    _validate_pytorch_dataset(dataset)

		  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
		  def test_make_pytorch_dataset_from_images(self):
		    """Test creating a PyTorch Dataset from an ImageDataset."""
		    path = os.path.join(os.path.dirname(__file__), 'images')
		    files = [os.path.join(path, f) for f in os.listdir(path)]
		    # NOTE(review): y is hard-coded to 10 rows; presumably the images
		    # directory holds exactly 10 files -- confirm.
		    y = np.random.random((10, 1))
		    ids = [str(i) for i in range(len(files))]
		    dataset = dc.data.ImageDataset(files, y, ids=ids)
		    _validate_pytorch_dataset(dataset)

		  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
		  def test_make_pytorch_dataset_from_disk(self):
		    """Test creating a PyTorch Dataset from a DiskDataset."""
		    dataset = load_solubility_data()
		    _validate_pytorch_dataset(dataset)

deepchem/models/fcnet.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -395,7 +395,7 @@ class MultitaskFitTransformRegressor(MultitaskRegressor):
		for transformer in fit_transformers:
		assert transformer.transform_X and not (transformer.transform_y or
		transformer.transform_w)
		X_b, _, _ = transformer.transform_array(X_b, None, None)
		X_b, _, _, _ = transformer.transform_array(X_b, None, None, None)
		n_features = X_b.shape[1]
		logger.info("n_features after fit_transform: %d", int(n_features))
		super(MultitaskFitTransformRegressor, self).__init__(
		@@ -418,7 +418,7 @@ class MultitaskFitTransformRegressor(MultitaskRegressor):
		if X_b is not None:
		if mode == 'fit':
		for transformer in self.fit_transformers:
		X_b, _, _ = transformer.transform_array(X_b, None, None)
		X_b, _, _, _ = transformer.transform_array(X_b, None, None, None)
		if mode == 'predict':
		dropout = np.array(0.0)
		else:

deepchem/trans/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -19,3 +19,4 @@ from deepchem.trans.transformers import FeaturizationTransformer
		from deepchem.trans.transformers import ImageTransformer
		from deepchem.trans.transformers import DataTransforms
		from deepchem.trans.transformers import Transformer
		from deepchem.trans.duplicate import DuplicateBalancingTransformer

deepchem/trans/duplicate.py

0 → 100644

+173 −0

Original line number	Diff line number	Diff line
		import logging
		import numpy as np
		from typing import Tuple
		from deepchem.data import Dataset
		from deepchem.trans.transformers import Transformer

		logger = logging.getLogger(__name__)


		class DuplicateBalancingTransformer(Transformer):
		  """Balance binary or multiclass datasets by duplicating rarer class samples.

		  This class balances a dataset by duplicating samples of the rarer class so
		  that the sum of all example weights from all classes is the same. (Up to
		  integer rounding of course). This can be useful when you're working on an
		  imbalanced dataset where there are far fewer examples of some classes than
		  others.

		  This class differs from `BalancingTransformer` in that it actually
		  duplicates rarer class samples rather than just increasing their sample
		  weights. This may be more friendly for models that are numerically fragile
		  and can't handle imbalanced example weights.

		  Examples
		  --------
		  Here's an example for a binary dataset.

		  >>> n_samples = 10
		  >>> n_features = 3
		  >>> n_tasks = 1
		  >>> n_classes = 2
		  >>> import deepchem as dc
		  >>> import numpy as np
		  >>> ids = np.arange(n_samples)
		  >>> X = np.random.rand(n_samples, n_features)
		  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
		  >>> w = np.ones((n_samples, n_tasks))
		  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
		  >>> transformer = dc.trans.DuplicateBalancingTransformer(dataset=dataset)
		  >>> dataset = transformer.transform(dataset)

		  And here's a multiclass dataset example.

		  >>> n_samples = 50
		  >>> n_features = 3
		  >>> n_tasks = 1
		  >>> n_classes = 5
		  >>> ids = np.arange(n_samples)
		  >>> X = np.random.rand(n_samples, n_features)
		  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
		  >>> w = np.ones((n_samples, n_tasks))
		  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
		  >>> transformer = dc.trans.DuplicateBalancingTransformer(dataset=dataset)
		  >>> dataset = transformer.transform(dataset)

		  See Also
		  --------
		  deepchem.trans.BalancingTransformer: Balance by changing sample weights.

		  Note
		  ----
		  This transformer is only well-defined for singletask datasets. (Since
		  examples are actually duplicated, there's no meaningful way to duplicate
		  across multiple tasks in a way that preserves the balance.)

		  This transformer is only meaningful for classification datasets where `y`
		  takes on a limited set of values. This class transforms all of `X`, `y`,
		  `w`, `ids`.

		  Raises
		  ------
		  `ValueError` if the provided dataset is multitask.
		  """

		  def __init__(self, dataset: Dataset):
		    """Compute the per-class duplication ratios from `dataset`.

		    Parameters
		    ----------
		    dataset: Dataset
		      Singletask dataset whose labels `y` and weights `w` are used to work
		      out how many times the samples of each class must be repeated.

		    Raises
		    ------
		    ValueError
		      If `dataset` has more than one task, or if `y`/`w` are not 1D or 2D.
		    """
		    super(DuplicateBalancingTransformer, self).__init__(
		        transform_X=True,
		        transform_y=True,
		        transform_w=True,
		        transform_ids=True,
		        dataset=dataset)

		    if len(dataset.get_task_names()) > 1:
		      raise ValueError(
		          "This transformation is only defined for singletask datsets.")

		    # Get the labels/weights
		    y = dataset.y
		    w = dataset.w
		    # Normalize shapes
		    if len(y.shape) == 1:
		      y = np.reshape(y, (len(y), 1))
		    if len(w.shape) == 1:
		      w = np.reshape(w, (len(w), 1))
		    if len(y.shape) != 2:
		      raise ValueError("y must be of shape (N,) or (N, n_tasks)")
		    if len(w.shape) != 2:
		      raise ValueError("w must be of shape (N,) or (N, n_tasks)")
		    # Classes are collected before filtering so that classes whose samples
		    # all have zero weight are still listed (with a ratio of 0 below).
		    self.classes = sorted(np.unique(y))
		    # Remove labels with zero weights. The same mask must be applied to
		    # both arrays: boolean-mask indexing flattens them to 1D, and `y == c`
		    # is only a valid index into `w` if the two stay aligned.
		    nonzero = w != 0
		    y = y[nonzero]
		    w = w[nonzero]
		    # Note that we may have 0 elements of a given class since we remove those
		    # labels with zero weight.
		    class_weights = [np.sum(w[y == c]) for c in self.classes]
		    weight_largest = max(class_weights)
		    # This is the right ratio since int(N/num_c) * num_c \approx N
		    # for all classes. A class with no remaining weight gets ratio 0, which
		    # means transform_array emits none of its samples.
		    self.duplication_ratio = [
		        int(weight_largest / float(c_weight)) if c_weight > 0 else 0
		        for c_weight in class_weights
		    ]

		  def transform_array(
		      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
		      ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
		    """Transform the data in a set of (X, y, w, id) arrays.

		    Samples are grouped by class (in the sorted order of `self.classes`) and
		    every sample of a class is repeated that class's duplication ratio
		    times. `y` and `w` are flattened, so they are returned as 1D arrays.

		    Parameters
		    ----------
		    X: np.ndarray
		      Array of features
		    y: np.ndarray
		      Array of labels
		    w: np.ndarray
		      Array of weights.
		    ids: np.ndarray
		      Array of identifiers

		    Returns
		    -------
		    Xtrans: np.ndarray
		      Transformed array of features
		    ytrans: np.ndarray
		      Transformed array of labels
		    wtrans: np.ndarray
		      Transformed array of weights
		    idtrans: np.ndarray
		      Transformed array of identifiers

		    Raises
		    ------
		    ValueError
		      If `y` or `w` is not of shape (N,) or (N, 1).
		    """
		    if not (len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1)):
		      raise ValueError("y must be of shape (N,) or (N, 1)")
		    if not (len(w.shape) == 1 or (len(w.shape) == 2 and w.shape[1] == 1)):
		      raise ValueError("w must be of shape (N,) or (N, 1)")
		    # Flattening is safe because of shape check above
		    y = y.flatten()
		    w = w.flatten()
		    X_dups, y_dups, w_dups, ids_dups = [], [], [], []
		    for c, ratio in zip(self.classes, self.duplication_ratio):
		      c_inds = (y == c)
		      # np.repeat along axis 0 repeats each selected row `ratio` times;
		      # a ratio of 0 yields an empty array for that class.
		      X_dups.append(np.repeat(X[c_inds], ratio, axis=0))
		      y_dups.append(np.repeat(y[c_inds], ratio, axis=0))
		      w_dups.append(np.repeat(w[c_inds], ratio, axis=0))
		      ids_dups.append(np.repeat(ids[c_inds], ratio, axis=0))
		    Xtrans = np.concatenate(X_dups, axis=0)
		    ytrans = np.concatenate(y_dups, axis=0)
		    wtrans = np.concatenate(w_dups, axis=0)
		    idstrans = np.concatenate(ids_dups, axis=0)
		    return (Xtrans, ytrans, wtrans, idstrans)

Admin message