Commit afad4301 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

change

parent af2db874
Loading
Loading
Loading
Loading
+37 −3
Original line number Diff line number Diff line
@@ -1655,6 +1655,36 @@ class DiskDataset(Dataset):
    return np.array(
        load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)

  def get_shard_y(self, i):
    """Retrieves the labels for the i-th shard from disk.

    Serves labels from the in-memory shard cache when available; otherwise
    reads them back from the on-disk file recorded in the metadata table.

    Parameters
    ----------
    i: int
      Shard index for shard to retrieve labels from
    """
    cached = self._cached_shards
    if cached is not None and cached[i] is not None:
      # Cache hit: no disk access required.
      return cached[i].y
    label_file = self.metadata_df.iloc[i]['y']
    labels = load_from_disk(os.path.join(self.data_dir, label_file))
    return np.array(labels, dtype=object)

  def get_shard_w(self, i):
    """Retrieves the weights for the i-th shard from disk.

    Serves weights from the in-memory shard cache when available; otherwise
    reads them back from the on-disk file recorded in the metadata table.

    Parameters
    ----------
    i: int
      Shard index for shard to retrieve weights from
    """
    cached = self._cached_shards
    if cached is not None and cached[i] is not None:
      # Cache hit: no disk access required.
      return cached[i].w
    weight_file = self.metadata_df.iloc[i]['w']
    weights = load_from_disk(os.path.join(self.data_dir, weight_file))
    return np.array(weights, dtype=object)

  def add_shard(self, X, y, w, ids):
    """Adds a data shard."""
    metadata_rows = self.metadata_df.values.tolist()
@@ -1758,9 +1788,12 @@ class DiskDataset(Dataset):
  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    if len(self) == 0:
      return np.array([])
    ys = []
    one_dimensional = False
    for (_, y_b, _, _) in self.itershards():
    for i in range(self.get_number_shards()):
      y_b = self.get_shard_y(i)
      ys.append(y_b)
      if len(y_b.shape) == 1:
        one_dimensional = True
@@ -1774,8 +1807,9 @@ class DiskDataset(Dataset):
    """Get the weight vector for this dataset as a single numpy array."""
    ws = []
    one_dimensional = False
    for (_, _, w_b, _) in self.itershards():
      ws.append(np.array(w_b))
    for i in range(self.get_number_shards()):
      w_b = self.get_shard_w(i)
      ws.append(w_b)
      if len(w_b.shape) == 1:
        one_dimensional = True
    if not one_dimensional:
+30 −0
Original line number Diff line number Diff line
import numpy as np
import deepchem as dc


def test_y_property():
  """Test that dataset.y works."""
  n_rows, n_cols, n_tasks = 10, 10, 1
  X = np.random.rand(n_rows, n_cols)
  y = np.random.randint(2, size=(n_rows, n_tasks))
  w = np.ones((n_rows, n_tasks))
  ids = np.array(["id"] * n_rows)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Labels round-tripped through disk must equal the originals exactly.
  np.testing.assert_array_equal(y, dataset.y)


def test_w_property():
  """Test that dataset.w works."""
  # (Docstring previously said "dataset.y" — copy-paste error; this test
  # exercises the w property.)
  num_datapoints = 10
  num_features = 10
  num_tasks = 1
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.ones((num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  w_out = dataset.w
  # Weights round-tripped through disk must equal the originals exactly.
  np.testing.assert_array_equal(w, w_out)
+148 −0
Original line number Diff line number Diff line
import numpy as np
import unittest
import deepchem as dc
import itertools
import os


class TestBalancingTransformer(unittest.TestCase):
  """
  Test top-level API for transformer objects.

  Each test builds a small random classification dataset, applies
  BalancingTransformer (a w-only transformer), and checks that X, y and ids
  are untouched while the per-class total weights are equalized.
  """

  def test_binary_1d(self):
    """Test balancing transformer on single-task dataset without explicit task dimension."""
    n_samples = 20
    n_features = 3
    n_classes = 2
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples,))
    w = np.ones((n_samples,))
    dataset = dc.data.NumpyDataset(X, y, w)

    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = balancing_transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    y_task = y_t
    w_task = w_t
    w_orig_task = w
    # Assert that entries with zero weight retain zero weight
    np.testing.assert_allclose(w_task[w_orig_task == 0],
                               np.zeros_like(w_task[w_orig_task == 0]))
    # Check that sum of 0s equals sum of 1s in transformed for each task
    assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))

  def test_binary_singletask(self):
    """Test balancing transformer on single-task dataset."""
    n_samples = 20
    n_features = 3
    n_tasks = 1
    n_classes = 2
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w)

    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = balancing_transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(dataset.get_task_names()):
      y_task = y_t[:, ind]
      w_task = w_t[:, ind]
      w_orig_task = w[:, ind]
      # Assert that entries with zero weight retain zero weight
      np.testing.assert_allclose(w_task[w_orig_task == 0],
                                 np.zeros_like(w_task[w_orig_task == 0]))
      # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))

  def test_binary_multitask(self):
    """Test balancing transformer on multitask dataset."""
    n_samples = 10
    n_features = 3
    n_tasks = 5
    n_classes = 2
    # Seed the RNG for determinism; an unseeded draw can (rarely) produce a
    # task column with only one class, making this test flaky.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    multitask_dataset = dc.data.NumpyDataset(X, y, w)
    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=multitask_dataset)
    multitask_dataset = balancing_transformer.transform(multitask_dataset)
    X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
                            multitask_dataset.w, multitask_dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(multitask_dataset.get_task_names()):
      y_task = y_t[:, ind]
      w_task = w_t[:, ind]
      w_orig_task = w[:, ind]
      # Assert that entries with zero weight retain zero weight
      np.testing.assert_allclose(w_task[w_orig_task == 0],
                                 np.zeros_like(w_task[w_orig_task == 0]))
      # Check that sum of 0s equals sum of 1s in transformed for each task
      assert np.isclose(
          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))

  def test_multiclass_singletask(self):
    """Test balancing transformer on a multiclass single-task dataset."""
    n_samples = 50
    n_features = 3
    n_tasks = 1
    n_classes = 5
    # Seed for determinism: every class must appear in the random labels.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w)

    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    dataset = balancing_transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    for ind, _ in enumerate(dataset.get_task_names()):
      y_task = y_t[:, ind]
      w_task = w_t[:, ind]
      # Check that total weight is equal across every pair of classes.
      for i, j in itertools.product(range(n_classes), range(n_classes)):
        if i == j:
          continue
        assert np.isclose(
            np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))
+0 −88
Original line number Diff line number Diff line
@@ -18,35 +18,6 @@ import tensorflow as tf
import scipy.ndimage


def load_classification_data():
  """Loads classification data from example.csv"""
  # Resolve the example CSV relative to this test file's location.
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["outcome"]
  # (Removed unused local `task_type = "classification"` — it was never read.)
  input_file = os.path.join(current_dir,
                            "../../models/tests/example_classification.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_multitask_data():
  """Load example multitask data."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  # Seventeen tasks named task0..task16, matching the example CSV's columns.
  tasks = ["task%d" % i for i in range(17)]
  csv_path = os.path.join(current_dir,
                          "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(csv_path)


def load_solubility_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -549,65 +520,6 @@ class TestTransformers(unittest.TestCase):
    # Check that untransform does the right thing.
    np.testing.assert_allclose(power_transformer.untransform(y_t), y)

  def test_singletask_balancing_transformer(self):
    """Test balancing transformer on single-task dataset."""
    dataset = load_classification_data()
    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
    transformed = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (transformed.X, transformed.y, transformed.w,
                            transformed.ids)
    # A w-only transformer must leave ids, X and y untouched.
    for orig_id, new_id in zip(ids, ids_t):
      assert orig_id == new_id
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    for task_idx in range(len(transformed.get_task_names())):
      task_y = y_t[:, task_idx]
      task_w = w_t[:, task_idx]
      task_w_orig = w[:, task_idx]
      # Zero-weight entries must stay at zero weight after balancing.
      np.testing.assert_allclose(task_w[task_w_orig == 0],
                                 np.zeros_like(task_w[task_w_orig == 0]))
      # Total weight assigned to class 0 equals that assigned to class 1.
      assert np.isclose(
          np.sum(task_w[task_y == 0]), np.sum(task_w[task_y == 1]))

  def test_multitask_balancing_transformer(self):
    """Test balancing transformer on multitask dataset."""
    dataset = load_multitask_data()
    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=dataset)
    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
    transformed = transformer.transform(dataset)
    X_t, y_t, w_t, ids_t = (transformed.X, transformed.y, transformed.w,
                            transformed.ids)
    # A w-only transformer must leave ids, X and y untouched.
    for orig_id, new_id in zip(ids, ids_t):
      assert orig_id == new_id
    np.testing.assert_allclose(X, X_t)
    np.testing.assert_allclose(y, y_t)
    for task_idx in range(len(transformed.get_task_names())):
      task_y = y_t[:, task_idx]
      task_w = w_t[:, task_idx]
      task_w_orig = w[:, task_idx]
      # Zero-weight entries must stay at zero weight after balancing.
      np.testing.assert_allclose(task_w[task_w_orig == 0],
                                 np.zeros_like(task_w[task_w_orig == 0]))
      # Total weight assigned to class 0 equals that assigned to class 1.
      assert np.isclose(
          np.sum(task_w[task_y == 0]), np.sum(task_w[task_y == 1]))

  def test_coulomb_fit_transformer(self):
    """Test coulomb fit transformer on singletask dataset."""
    n_samples = 10
+71 −22
Original line number Diff line number Diff line
@@ -771,15 +771,37 @@ class LogTransformer(Transformer):
class BalancingTransformer(Transformer):
  """Balance positive and negative examples for weights.

  This class balances the sample weights so that the sum of all example
  weights from all classes is the same. This can be useful when you're
  working on an imbalanced dataset where there are far fewer examples of some
  classes than others.

  Example
  -------

  Here's an example for a binary dataset.

  >>> n_samples = 10
  >>> n_features = 3
  >>> n_tasks = 1
  >>> n_classes = 2
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features)
  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> transformer = dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  >>> dataset = transformer.transform(dataset)

  And here's a multiclass dataset example.

  >>> n_samples = 50
  >>> n_features = 3
  >>> n_tasks = 1
  >>> n_classes = 5
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features)
  >>> y = np.random.randint(2, size=(n_samples, n_tasks))
  >>> y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> transformer = dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
@@ -787,20 +809,21 @@ class BalancingTransformer(Transformer):

  Note
  ----
  This class can only transform `w`. Note at present this class only supports
  binary datasets and not multiclass datasets.
  This transformer is only meaningful for classification datasets where `y`
  takes on a limited set of values. This class can only transform `w` and does
  not transform `X` or `y`.

  Raises
  ------
  `ValueError` if `transform_X` or `transform_y` are set.
  `ValueError` if `transform_X` or `transform_y` are set. Also raises
  `ValueError` if `y` or `w` aren't of shape `(N,)` or `(N, n_tasks)`.
  """

  def __init__(self,
               transform_X=False,
               transform_y=False,
               transform_w=False,
               dataset=None,
               seed=None):
               dataset=None):
    # BalancingTransformer can only transform weights.
    if transform_X or transform_y:
      raise ValueError("Cannot transform X or y")
@@ -815,22 +838,35 @@ class BalancingTransformer(Transformer):
    # Compute weighting factors from dataset.
    y = dataset.y
    w = dataset.w
    # Handle 1-D case
    if len(y.shape) == 1:
      y = np.reshape(y, (len(y), 1))
    if len(w.shape) == 1:
      w = np.reshape(w, (len(w), 1))
    if len(y.shape) != 2:
      raise ValueError("y must be of shape (N,) or (N, n_tasks)")
    if len(w.shape) != 2:
      raise ValueError("w must be of shape (N,) or (N, n_tasks)")
    # Ensure dataset is binary
    np.testing.assert_allclose(sorted(np.unique(y)), np.array([0., 1.]))
    self.classes = sorted(np.unique(y))
    #np.testing.assert_allclose(sorted(np.unique(y)), np.array([0., 1.]))
    weights = []
    for ind, task in enumerate(dataset.get_task_names()):
      task_w = w[:, ind]
      task_y = y[:, ind]
      # Remove labels with zero weights
      task_y = task_y[task_w != 0]
      num_positives = np.count_nonzero(task_y)
      num_negatives = len(task_y) - num_positives
      if num_positives > 0:
        pos_weight = float(num_negatives) / num_positives
      else:
        pos_weight = 1
      neg_weight = 1
      weights.append((neg_weight, pos_weight))
      N_task = len(task_y)
      class_counts = []
      # Note that by definition of classes, num_c >= 1 for all classes
      for c in self.classes:
        # this works because task_y is 1D
        num_c = len(np.where(task_y == c)[0])
        class_counts.append(num_c)
      # This is the right ratio since N_task/num_c * num_c = N_task
      # for all classes
      class_weights = [N_task / float(num_c) for num_c in class_counts]
      weights.append(class_weights)
    self.weights = weights

  def transform_array(self, X, y, w):
@@ -855,13 +891,26 @@ class BalancingTransformer(Transformer):
      Transformed array of weights
    """
    w_balanced = np.zeros_like(w)
    for ind in range(y.shape[1]):
    if len(y.shape) == 1:
      n_tasks = 1
    elif len(y.shape) == 2:
      n_tasks = y.shape[1]
    else:
      raise ValueError("y must be of shape (N,) or (N, n_tasks)")
    for ind in range(n_tasks):
      if n_tasks == 1:
        task_y = y
        task_w = w
      else:
        task_y = y[:, ind]
        task_w = w[:, ind]
      zero_indices = np.logical_and(task_y == 0, task_w != 0)
      one_indices = np.logical_and(task_y == 1, task_w != 0)
      w_balanced[zero_indices, ind] = self.weights[ind][0]
      w_balanced[one_indices, ind] = self.weights[ind][1]
      for i, c in enumerate(self.classes):
        class_indices = np.logical_and(task_y == c, task_w != 0)
        # Set to the class weight computed previously
        if n_tasks == 1:
          w_balanced[class_indices] = self.weights[ind][i]
        else:
          w_balanced[class_indices, ind] = self.weights[ind][i]
    return (X, y, w_balanced)