Commit 92d4aa76 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Sparse shuffle support

parent b567fa17
Loading
Loading
Loading
Loading
+62 −4
Original line number Diff line number Diff line
@@ -21,6 +21,26 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

def sparsify_features(X):
  """Extracts a sparse feature representation from dense feature array.

  Each row of X is reduced to a pair (nonzero_inds, nonzero_vals) holding
  the indices of its nonzero entries and the values at those indices.

  Parameters
  ----------
  X: np.ndarray
    Dense feature array of shape (n_samples, n_features).

  Returns
  -------
  np.ndarray
    Object array whose i-th entry is the (indices, values) pair for row i.
  """
  def _sparsify_row(row):
    # np.nonzero returns a 1-tuple for a 1-d row; unwrap to the index array.
    nonzero_inds = np.nonzero(row)[0]
    return (nonzero_inds, row[nonzero_inds])
  # Iterate rows directly rather than indexing by position (idiomatic numpy);
  # dtype=object keeps the per-row (indices, values) pairs unflattened.
  return np.array([_sparsify_row(row) for row in X], dtype=object)

def densify_features(X_sparse, num_features):
  """Expands sparse feature representation to dense feature array.

  Inverse of sparsify_features: scatters each (indices, values) pair back
  into a zero-initialized row of width num_features.
  """
  dense = np.zeros((len(X_sparse), num_features))
  for row, (nonzero_inds, nonzero_vals) in enumerate(X_sparse):
    # Indices may be stored as floats in the object array; cast before use.
    dense[row, nonzero_inds.astype(int)] = nonzero_vals
  return dense

def pad_features(batch_size, X_b):
  """Pads a batch of features to have precisely batch_size elements.
  
@@ -203,10 +223,6 @@ class Dataset(object):
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      if compute_feature_statistics:
        ########################################################## DEBUG
        print("compute_feature_statistics")
        print(compute_feature_statistics)
        ########################################################## DEBUG
        X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
        save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
        save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
@@ -443,6 +459,48 @@ class Dataset(object):
        self.verbosity)
    #########################################################  TIMING

  def sparse_shuffle(self):
    """Shuffling that exploits data sparsity to shuffle large datasets.

    Only for 1-dimensional feature vectors (does not work for tensorial
    featurizations).
    """
    #########################################################  TIMING
    start = time.time()
    #########################################################  TIMING
    shard_size = self.get_shard_size()
    num_shards = self.get_number_shards()
    # Accumulate every shard, with features stored sparsely to fit in memory.
    all_X_sparse, all_y, all_w, all_ids = [], [], [], []
    num_features = None
    for shard_num in range(num_shards):
      X_s, y_s, w_s, ids_s = self.get_shard(shard_num)
      if num_features is None:
        num_features = X_s.shape[1]
      all_X_sparse.append(sparsify_features(X_s))
      all_y.append(y_s)
      all_w.append(w_s)
      all_ids.append(np.atleast_1d(np.squeeze(ids_s)))
    # Assemble the full dataset in memory.
    X_sparse = np.vstack(all_X_sparse)
    y = np.vstack(all_y)
    w = np.vstack(all_w)
    ids = np.concatenate(all_ids)
    # Apply one random permutation consistently across all arrays.
    perm = np.random.permutation(len(X_sparse))
    X_sparse, y, w, ids = X_sparse[perm], y[perm], w[perm], ids[perm]
    # Densify shard-sized slices and write the shuffled shards back to disk.
    for shard_num in range(num_shards):
      lo, hi = shard_num * shard_size, (shard_num + 1) * shard_size
      X_s = densify_features(X_sparse[lo:hi], num_features)
      self.set_shard(shard_num, X_s, y[lo:hi], w[lo:hi], ids[lo:hi])
    #########################################################  TIMING
    end = time.time()
    log("TIMING: sparse_shuffle took %0.3f s" % (end - start),
        self.verbosity)
    #########################################################  TIMING

  def shuffle(self, iterations=1):
    """Shuffles this dataset on disk to have random order."""
    #np.random.seed(9452)
+26 −0
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@ import tempfile
import os
import shutil
import numpy as np
from deepchem.datasets import sparsify_features
from deepchem.datasets import densify_features
from deepchem.datasets import pad_batch
from deepchem.datasets import pad_features
from deepchem.datasets import Dataset
@@ -27,6 +29,30 @@ class TestBasicDatasetAPI(TestDatasetAPI):
  Test basic top-level API for dataset objects.
  """

  def test_sparsify_and_densify(self):
    """Test that sparsify and densify work as inverses."""
    num_samples = 10
    num_features = num_samples

    def roundtrip(X):
      # Sparsify then densify; the result should reproduce X exactly.
      return densify_features(sparsify_features(X), num_features)

    # Identity matrix: exactly one nonzero entry per row.
    X = np.eye(num_samples)
    np.testing.assert_array_equal(X, roundtrip(X))

    # Random sparse binary features.
    np.random.seed(123)
    p = .05
    X = np.random.binomial(1, p, size=(num_samples, num_features))
    np.testing.assert_array_equal(X, roundtrip(X))

    # Edge case: matrix with no nonzero entries at all.
    X = np.zeros((num_samples, num_features))
    np.testing.assert_array_equal(X, roundtrip(X))

  def test_pad_features(self):
    """Test that pad_features pads features correctly."""
    batch_size = 100
+34 −0
Original line number Diff line number Diff line
@@ -57,6 +57,40 @@ class TestShuffle(TestAPI):
    assert y_orig.shape == y_new.shape
    assert w_orig.shape == w_new.shape

  def test_sparse_shuffle(self):
    """Test that sparse datasets can be shuffled quickly."""
    verbosity = "high"
    current_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(self.base_dir, "dataset")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/example.csv")

    # Small shard_size forces multiple shards so the shuffle is non-trivial.
    loader = DataLoader(tasks=["log-solubility"],
                        smiles_field="smiles",
                        featurizer=CircularFingerprint(size=1024),
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

    X_before, y_before, w_before, ids_before = dataset.to_numpy()
    len_before = len(dataset)

    dataset.sparse_shuffle()
    X_after, y_after, w_after, ids_after = dataset.to_numpy()

    # Shuffling must not add or drop samples.
    assert len(dataset) == len_before
    # The shuffling should have switched up the ordering...
    assert not np.array_equal(ids_before, ids_after)
    # ...but every original entry should still be present.
    assert sorted(ids_before) == sorted(ids_after)
    # A pure permutation leaves all array shapes untouched.
    assert X_before.shape == X_after.shape
    assert y_before.shape == y_after.shape
    assert w_before.shape == w_after.shape

  def test_reshard_shuffle(self):
    """Test that datasets can be merged."""
    verbosity = "high"
+2 −1
Original line number Diff line number Diff line
@@ -16,7 +16,8 @@ class HyperparamOpt(object):
  Provides simple hyperparameter search capabilities.
  """

  def __init__(self, model_class, tasks, task_types, fit_transformers=None, verbosity=None):
  def __init__(self, model_class, tasks, task_types, fit_transformers=None,
               verbosity=None):
    self.model_class = model_class
    self.tasks = tasks
    self.task_types = task_types
+0 −15
Original line number Diff line number Diff line
#!/usr/bin/python
#
# Copyright 2015 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ops for graph construction."""
from __future__ import print_function
from __future__ import division
Loading