Commit 92d4aa76 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Sparse shuffle support

parent b567fa17
Loading
Loading
Loading
Loading
+62 −4
Original line number Diff line number Diff line
@@ -21,6 +21,26 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

def sparsify_features(X):
  """Extracts a sparse feature representation from dense feature array.

  Each row of X is reduced to a pair (nonzero_inds, nonzero_vals) holding
  the indices of its nonzero entries and the values at those indices.

  Parameters
  ----------
  X: np.ndarray
    Dense feature array of shape (n_samples, n_features).

  Returns
  -------
  np.ndarray
    Object array whose i-th entry is the (indices, values) pair for row i.
  """
  def _sparsify_row(row):
    # np.nonzero returns a 1-tuple for a 1-d row; unwrap to the index array.
    nonzero_inds = np.nonzero(row)[0]
    return (nonzero_inds, row[nonzero_inds])
  # Iterate rows directly rather than indexing by position (idiomatic numpy);
  # dtype=object keeps the per-row (indices, values) pairs unflattened.
  return np.array([_sparsify_row(row) for row in X], dtype=object)

def densify_features(X_sparse, num_features):
  """Expands sparse feature representation to dense feature array.

  Inverse of sparsify_features: scatters each (indices, values) pair back
  into a zero-initialized row of width num_features.
  """
  dense = np.zeros((len(X_sparse), num_features))
  for row, (nonzero_inds, nonzero_vals) in enumerate(X_sparse):
    # Indices may be stored as floats in the object array; cast before use.
    dense[row, nonzero_inds.astype(int)] = nonzero_vals
  return dense

def pad_features(batch_size, X_b):
  """Pads a batch of features to have precisely batch_size elements.
  
@@ -203,10 +223,6 @@ class Dataset(object):
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      if compute_feature_statistics:
        ########################################################## DEBUG
        print("compute_feature_statistics")
        print(compute_feature_statistics)
        ########################################################## DEBUG
        X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
        save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
        save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
@@ -443,6 +459,48 @@ class Dataset(object):
        self.verbosity)
    #########################################################  TIMING

  def sparse_shuffle(self):
    """Shuffling that exploits data sparsity to shuffle large datasets.

    Only for 1-dimensional feature vectors (does not work for tensorial
    featurizations).
    """
    #########################################################  TIMING
    start = time.time()
    #########################################################  TIMING
    shard_size = self.get_shard_size()
    num_shards = self.get_number_shards()
    # Accumulate every shard, with features stored sparsely to fit in memory.
    all_X_sparse, all_y, all_w, all_ids = [], [], [], []
    num_features = None
    for shard_num in range(num_shards):
      X_s, y_s, w_s, ids_s = self.get_shard(shard_num)
      if num_features is None:
        num_features = X_s.shape[1]
      all_X_sparse.append(sparsify_features(X_s))
      all_y.append(y_s)
      all_w.append(w_s)
      all_ids.append(np.atleast_1d(np.squeeze(ids_s)))
    # Assemble the full dataset in memory.
    X_sparse = np.vstack(all_X_sparse)
    y = np.vstack(all_y)
    w = np.vstack(all_w)
    ids = np.concatenate(all_ids)
    # Apply one random permutation consistently across all arrays.
    perm = np.random.permutation(len(X_sparse))
    X_sparse, y, w, ids = X_sparse[perm], y[perm], w[perm], ids[perm]
    # Densify shard-sized slices and write the shuffled shards back to disk.
    for shard_num in range(num_shards):
      lo, hi = shard_num * shard_size, (shard_num + 1) * shard_size
      X_s = densify_features(X_sparse[lo:hi], num_features)
      self.set_shard(shard_num, X_s, y[lo:hi], w[lo:hi], ids[lo:hi])
    #########################################################  TIMING
    end = time.time()
    log("TIMING: sparse_shuffle took %0.3f s" % (end - start),
        self.verbosity)
    #########################################################  TIMING

  def shuffle(self, iterations=1):
    """Shuffles this dataset on disk to have random order."""
    #np.random.seed(9452)
+26 −0
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@ import tempfile
import os
import shutil
import numpy as np
from deepchem.datasets import sparsify_features
from deepchem.datasets import densify_features
from deepchem.datasets import pad_batch
from deepchem.datasets import pad_features
from deepchem.datasets import Dataset
@@ -27,6 +29,30 @@ class TestBasicDatasetAPI(TestDatasetAPI):
  Test basic top-level API for dataset objects.
  """

  def test_sparsify_and_densify(self):
    """Test that sparsify and densify work as inverses."""
    num_samples = 10
    num_features = num_samples

    def roundtrip(X):
      # Sparsify then densify; the result should reproduce X exactly.
      return densify_features(sparsify_features(X), num_features)

    # Identity matrix: exactly one nonzero entry per row.
    X = np.eye(num_samples)
    np.testing.assert_array_equal(X, roundtrip(X))

    # Random sparse binary features.
    np.random.seed(123)
    p = .05
    X = np.random.binomial(1, p, size=(num_samples, num_features))
    np.testing.assert_array_equal(X, roundtrip(X))

    # Edge case: matrix with no nonzero entries at all.
    X = np.zeros((num_samples, num_features))
    np.testing.assert_array_equal(X, roundtrip(X))

  def test_pad_features(self):
    """Test that pad_features pads features correctly."""
    batch_size = 100
+34 −0
Original line number Diff line number Diff line
@@ -57,6 +57,40 @@ class TestShuffle(TestAPI):
    assert y_orig.shape == y_new.shape
    assert w_orig.shape == w_new.shape

  def test_sparse_shuffle(self):
    """Test that sparse datasets can be shuffled quickly."""
    verbosity = "high"
    current_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(self.base_dir, "dataset")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/example.csv")

    # Small shard_size forces multiple shards so the shuffle is non-trivial.
    loader = DataLoader(tasks=["log-solubility"],
                        smiles_field="smiles",
                        featurizer=CircularFingerprint(size=1024),
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

    X_before, y_before, w_before, ids_before = dataset.to_numpy()
    len_before = len(dataset)

    dataset.sparse_shuffle()
    X_after, y_after, w_after, ids_after = dataset.to_numpy()

    # Shuffling must not add or drop samples.
    assert len(dataset) == len_before
    # The shuffling should have switched up the ordering...
    assert not np.array_equal(ids_before, ids_after)
    # ...but every original entry should still be present.
    assert sorted(ids_before) == sorted(ids_after)
    # A pure permutation leaves all array shapes untouched.
    assert X_before.shape == X_after.shape
    assert y_before.shape == y_after.shape
    assert w_before.shape == w_after.shape

  def test_reshard_shuffle(self):
    """Test that datasets can be merged."""
    verbosity = "high"
+2 −1
Original line number Diff line number Diff line
@@ -16,7 +16,8 @@ class HyperparamOpt(object):
  Provides simple hyperparameter search capabilities.
  """

  def __init__(self, model_class, tasks, task_types, fit_transformers=None, verbosity=None):
  def __init__(self, model_class, tasks, task_types, fit_transformers=None,
               verbosity=None):
    self.model_class = model_class
    self.tasks = tasks
    self.task_types = task_types
+0 −15
Original line number Diff line number Diff line
#!/usr/bin/python
#
# Copyright 2015 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ops for graph construction."""
from __future__ import print_function
from __future__ import division
Loading