Commit f5840d17 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #220 from rbharath/padding

Padding support
parents c1302d05 372d514e
Loading
Loading
Loading
Loading
+73 −1
Original line number Diff line number Diff line
@@ -21,6 +21,74 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

def pad_features(batch_size, X_b):
  """Pads a batch of features to have precisely batch_size elements.

  Version of pad_batch for use at prediction time. The output is filled
  by wrapping around the samples of X_b until the batch is full.

  Parameters
  ----------
  batch_size: int
    Desired number of samples in the returned batch.
  X_b: np.ndarray
    Feature array with at least one sample. Returned unchanged when it
    already holds exactly batch_size samples.

  Returns
  -------
  np.ndarray with len() == batch_size and the same dtype/feature shape
  as X_b.

  Raises
  ------
  ValueError
    If X_b is empty (the wrap-around fill could never terminate).
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
    return X_b
  # With num_samples == 0 the fill loop below would make zero progress
  # each iteration and spin forever; fail loudly instead.
  if num_samples == 0:
    raise ValueError("Cannot pad an empty batch of features.")
  # By invariant of when this is called, can assume num_samples < batch_size
  if len(X_b.shape) > 1:
    feature_shape = X_b.shape[1:]
    X_out = np.zeros((batch_size,) + feature_shape, dtype=X_b.dtype)
  else:
    X_out = np.zeros((batch_size,), dtype=X_b.dtype)

  # Fill the output by repeatedly copying (wrapping) the input samples;
  # the final copy may be a partial prefix of X_b.
  start = 0
  while start < batch_size:
    increment = min(batch_size - start, num_samples)
    X_out[start:start+increment] = X_b[:increment]
    start += increment
  return X_out

def pad_batch(batch_size, X_b, y_b, w_b, ids_b):
  """Pads batch to have size precisely batch_size elements.

  Fills in batch by wrapping around samples till whole batch is filled.

  Parameters
  ----------
  batch_size: int
    Desired number of samples in each returned array.
  X_b: np.ndarray
    Feature array; len(X_b) defines the number of input samples.
  y_b: np.ndarray
    Label array of shape (num_samples, num_tasks).
  w_b: np.ndarray
    Weight array of shape (num_samples, num_tasks).
  ids_b: np.ndarray
    1-D array of sample identifiers.

  Returns
  -------
  Tuple (X_out, y_out, w_out, ids_out), each with batch_size leading
  entries. The inputs are returned unchanged when already full-size.

  Raises
  ------
  ValueError
    If the batch is empty (the wrap-around fill could never terminate).
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
    return (X_b, y_b, w_b, ids_b)
  # With num_samples == 0 the fill loop below would make zero progress
  # each iteration and spin forever; fail loudly instead.
  if num_samples == 0:
    raise ValueError("Cannot pad an empty batch.")
  # By invariant of when this is called, can assume num_samples < batch_size
  if len(X_b.shape) > 1:
    feature_shape = X_b.shape[1:]
    X_out = np.zeros((batch_size,) + feature_shape, dtype=X_b.dtype)
  else:
    X_out = np.zeros((batch_size,), dtype=X_b.dtype)

  num_tasks = y_b.shape[1]
  y_out = np.zeros((batch_size, num_tasks), dtype=y_b.dtype)
  w_out = np.zeros((batch_size, num_tasks), dtype=w_b.dtype)
  ids_out = np.zeros((batch_size,), dtype=ids_b.dtype)

  # Fill all four arrays in lockstep by wrapping around the input
  # samples; the final copy may be a partial prefix.
  start = 0
  while start < batch_size:
    increment = min(batch_size - start, num_samples)
    X_out[start:start+increment] = X_b[:increment]
    y_out[start:start+increment] = y_b[:increment]
    w_out[start:start+increment] = w_b[:increment]
    ids_out[start:start+increment] = ids_b[:increment]
    start += increment
  return (X_out, y_out, w_out, ids_out)

class Dataset(object):
  """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
@@ -228,7 +296,8 @@ class Dataset(object):
          os.path.join(self.data_dir, row['ids'])), dtype=object)
      yield (X, y, w, ids)

  def iterbatches(self, batch_size=None, epoch=0, deterministic=False):
  def iterbatches(self, batch_size=None, epoch=0, deterministic=False,
                  pad_batches=False):
    """Returns minibatches from dataset randomly."""
    num_shards = self.get_number_shards()
    if not deterministic:
@@ -255,6 +324,9 @@ class Dataset(object):
        y_batch = y[perm_indices]
        w_batch = w[perm_indices]
        ids_batch = ids[perm_indices]
        if pad_batches:
          (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
            shard_batch_size, X_batch, y_batch, w_batch, ids_batch)
        yield (X_batch, y_batch, w_batch, ids_batch)

  def reshard(self, shard_size):
+121 −8
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@ import tempfile
import os
import shutil
import numpy as np
from deepchem.datasets import pad_batch
from deepchem.datasets import pad_features
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
@@ -24,6 +26,125 @@ class TestBasicDatasetAPI(TestDatasetAPI):
  """
  Test basic top-level API for dataset objects.
  """

  def test_pad_features(self):
    """Test that pad_features pads features correctly.

    Covers partial batches (wrap-around), exact-size batches, object
    dtype features, and multidimensional features.
    """
    batch_size = 100
    num_features = 10

    # Test case where 2*n_samples < batch_size (multiple wraps needed)
    n_samples = 29
    X_b = np.zeros((n_samples, num_features))

    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case where n_samples < batch_size (single partial wrap)
    n_samples = 79
    X_b = np.zeros((n_samples, num_features))
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case where n_samples == batch_size (no padding needed)
    n_samples = 100
    X_b = np.zeros((n_samples, num_features))
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case for object featurization.
    n_samples = 2
    X_b = np.array([{"a": 1}, {"b": 2}])
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case for more complicated object featurization
    n_samples = 2
    X_b = np.array([(1, {"a": 1}), (2, {"b": 2})])
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case with multidimensional data
    n_samples = 50
    num_atoms = 15
    d = 3
    X_b = np.zeros((n_samples, num_atoms, d))
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size
  

  def test_pad_batches(self):
    """Test that pad_batch pads batches correctly.

    Each case builds matching (y, w, ids) arrays for a feature array,
    pads to batch_size, and checks all four outputs are full-size.
    """
    batch_size = 100
    num_features = 10
    num_tasks = 5

    def _check_padded(X_b, n_samples):
      # Build companion arrays, pad, and verify every output length.
      y_b = np.zeros((n_samples, num_tasks))
      w_b = np.zeros((n_samples, num_tasks))
      ids_b = np.zeros((n_samples,))
      X_out, y_out, w_out, ids_out = pad_batch(
          batch_size, X_b, y_b, w_b, ids_b)
      assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

    # Case where 2*n_samples < batch_size (multiple wraps needed)
    _check_padded(np.zeros((29, num_features)), 29)

    # Case where n_samples < batch_size (single partial wrap)
    _check_padded(np.zeros((79, num_features)), 79)

    # Case where n_samples == batch_size (no padding needed)
    _check_padded(np.zeros((100, num_features)), 100)

    # Case for object featurization.
    _check_padded(np.array([{"a": 1}, {"b": 2}]), 2)

    # Case for more complicated object featurization
    _check_padded(np.array([(1, {"a": 1}), (2, {"b": 2})]), 2)

    # Case with multidimensional data
    _check_padded(np.zeros((50, 15, 3)), 50)
    
  def test_get_task_names(self):
    """Test that get_task_names returns correct task_names"""
    solubility_dataset = self.load_solubility_data()
@@ -108,14 +229,6 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
    print("type(X_shape), type(y_shape), type(w_shape), type(ids_shape)")
    print(type(X_shape), type(y_shape), type(w_shape), type(ids_shape))
    print("type(X.shape), type(y.shape), type(w.shape), type(ids.shape)")
    print(type(X.shape), type(y.shape), type(w.shape), type(ids.shape))
    print("X_shape, y_shape, w_shape, ids_shape")
    print(X_shape, y_shape, w_shape, ids_shape)
    print("X.shape, y.shape, w.shape, ids.shape")
    print(X.shape, y.shape, w.shape, ids.shape)
    assert X_shape == X.shape
    assert y_shape == y.shape
    assert w_shape == w.shape
+16 −4
Original line number Diff line number Diff line
@@ -169,9 +169,13 @@ class NeighborListAtomicCoordinates(Featurizer):
    Threshold distance [Angstroms] for counting neighbors.
  """ 

  def __init__(self, neighbor_cutoff=4):
  def __init__(self, max_num_neighbors=None, neighbor_cutoff=4):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    if max_num_neighbors is not None:
      if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
        raise ValueError("max_num_neighbors must be positive integer.")
    self.max_num_neighbors = max_num_neighbors
    self.neighbor_cutoff = neighbor_cutoff
    # Type of data created by this featurizer
    self.dtype = object
@@ -219,9 +223,17 @@ class NeighborListAtomicCoordinates(Featurizer):
            continue
          # TODO(rbharath): How does distance need to be modified here to
          # account for periodic boundary conditions?
          if np.linalg.norm(coords[atom] - coords[neighbor_atom]) < self.neighbor_cutoff:
            neighbor_list[atom].add(neighbor_atom)
          dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
          if dist < self.neighbor_cutoff:
            neighbor_list[atom].add((neighbor_atom, dist))
          
      # Sort neighbors by distance
      closest_neighbors = sorted(
          list(neighbor_list[atom]), key=lambda elt: elt[1])
      closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
      # Pick up to max_num_neighbors
      closest_neighbors = closest_neighbors[:self.max_num_neighbors]
      neighbor_list[atom] = closest_neighbors

      neighbor_list[atom] = sorted(list(neighbor_list[atom]))
        
    return (bohr_coords, neighbor_list)
+34 −0
Original line number Diff line number Diff line
@@ -194,3 +194,37 @@ class TestAtomicCoordinates(unittest.TestCase):
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == N-1

  def test_neighbor_list_max_num_neighbors(self):
    """
    Test that neighbor lists return only max_num_neighbors.
    """
    N = self.mol.GetNumAtoms()

    max_num_neighbors = 1
    featurizer = NeighborListAtomicCoordinates(max_num_neighbors)
    nblist = featurizer._featurize(self.mol)[1]

    # No atom may exceed the requested neighbor cap.
    for atom in range(N):
      assert len(nblist[atom]) <= max_num_neighbors

    # Recompute pairwise distances by hand and check that the single
    # retained neighbor is the nearest one (since max_num_neighbors = 1).
    coords = get_coords(self.mol)
    for i in range(N):
      closest_dist, closest_nbr = np.inf, None
      for j in range(N):
        if j == i:
          continue
        dist = np.linalg.norm(coords[i] - coords[j])
        print("Distance(%d, %d) = %f" % (i, j, dist))
        if dist < closest_dist:
          closest_dist, closest_nbr = dist, j
      print("Closest neighbor to %d is %d" % (i, closest_nbr))
      print("Distance: %f" % closest_dist)
      # Neighbors beyond the cutoff are dropped entirely.
      if closest_dist < featurizer.neighbor_cutoff:
        assert nblist[i] == [closest_nbr]
      else:
        assert nblist[i] == []
+0 −1
Original line number Diff line number Diff line
@@ -81,7 +81,6 @@ class HyperparamOpt(object):
        model = self.model_class(
            self.tasks, self.task_types, model_params, model_dir,
            verbosity=self.verbosity)
        
      model.fit(train_dataset)
      model.save()
    
Loading