Commit f5840d17 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #220 from rbharath/padding

Padding support
parents c1302d05 372d514e
Loading
Loading
Loading
Loading
+73 −1
Original line number Diff line number Diff line
@@ -21,6 +21,74 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

def pad_features(batch_size, X_b):
  """Pads a batch of features to have precisely batch_size elements.

  Version of pad_batch for use at prediction time. The output is filled
  by wrapping around the samples of X_b until the batch is full.

  Parameters
  ----------
  batch_size: int
    Desired number of samples in the returned batch.
  X_b: np.ndarray
    Feature array with at least one sample. Returned unchanged when it
    already holds exactly batch_size samples.

  Returns
  -------
  np.ndarray with len() == batch_size and the same dtype/feature shape
  as X_b.

  Raises
  ------
  ValueError
    If X_b is empty (the wrap-around fill could never terminate).
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
    return X_b
  # With num_samples == 0 the fill loop below would make zero progress
  # each iteration and spin forever; fail loudly instead.
  if num_samples == 0:
    raise ValueError("Cannot pad an empty batch of features.")
  # By invariant of when this is called, can assume num_samples < batch_size
  if len(X_b.shape) > 1:
    feature_shape = X_b.shape[1:]
    X_out = np.zeros((batch_size,) + feature_shape, dtype=X_b.dtype)
  else:
    X_out = np.zeros((batch_size,), dtype=X_b.dtype)

  # Fill the output by repeatedly copying (wrapping) the input samples;
  # the final copy may be a partial prefix of X_b.
  start = 0
  while start < batch_size:
    increment = min(batch_size - start, num_samples)
    X_out[start:start+increment] = X_b[:increment]
    start += increment
  return X_out

def pad_batch(batch_size, X_b, y_b, w_b, ids_b):
  """Pads batch to have size precisely batch_size elements.

  Fills in batch by wrapping around samples till whole batch is filled.

  Parameters
  ----------
  batch_size: int
    Desired number of samples in each returned array.
  X_b: np.ndarray
    Feature array; len(X_b) defines the number of input samples.
  y_b: np.ndarray
    Label array of shape (num_samples, num_tasks).
  w_b: np.ndarray
    Weight array of shape (num_samples, num_tasks).
  ids_b: np.ndarray
    1-D array of sample identifiers.

  Returns
  -------
  Tuple (X_out, y_out, w_out, ids_out), each with batch_size leading
  entries. The inputs are returned unchanged when already full-size.

  Raises
  ------
  ValueError
    If the batch is empty (the wrap-around fill could never terminate).
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
    return (X_b, y_b, w_b, ids_b)
  # With num_samples == 0 the fill loop below would make zero progress
  # each iteration and spin forever; fail loudly instead.
  if num_samples == 0:
    raise ValueError("Cannot pad an empty batch.")
  # By invariant of when this is called, can assume num_samples < batch_size
  if len(X_b.shape) > 1:
    feature_shape = X_b.shape[1:]
    X_out = np.zeros((batch_size,) + feature_shape, dtype=X_b.dtype)
  else:
    X_out = np.zeros((batch_size,), dtype=X_b.dtype)

  num_tasks = y_b.shape[1]
  y_out = np.zeros((batch_size, num_tasks), dtype=y_b.dtype)
  w_out = np.zeros((batch_size, num_tasks), dtype=w_b.dtype)
  ids_out = np.zeros((batch_size,), dtype=ids_b.dtype)

  # Fill all four arrays in lockstep by wrapping around the input
  # samples; the final copy may be a partial prefix.
  start = 0
  while start < batch_size:
    increment = min(batch_size - start, num_samples)
    X_out[start:start+increment] = X_b[:increment]
    y_out[start:start+increment] = y_b[:increment]
    w_out[start:start+increment] = w_b[:increment]
    ids_out[start:start+increment] = ids_b[:increment]
    start += increment
  return (X_out, y_out, w_out, ids_out)

class Dataset(object):
  """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
@@ -228,7 +296,8 @@ class Dataset(object):
          os.path.join(self.data_dir, row['ids'])), dtype=object)
      yield (X, y, w, ids)

  def iterbatches(self, batch_size=None, epoch=0, deterministic=False):
  def iterbatches(self, batch_size=None, epoch=0, deterministic=False,
                  pad_batches=False):
    """Returns minibatches from dataset randomly."""
    num_shards = self.get_number_shards()
    if not deterministic:
@@ -255,6 +324,9 @@ class Dataset(object):
        y_batch = y[perm_indices]
        w_batch = w[perm_indices]
        ids_batch = ids[perm_indices]
        if pad_batches:
          (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
            shard_batch_size, X_batch, y_batch, w_batch, ids_batch)
        yield (X_batch, y_batch, w_batch, ids_batch)

  def reshard(self, shard_size):
+121 −8
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@ import tempfile
import os
import shutil
import numpy as np
from deepchem.datasets import pad_batch
from deepchem.datasets import pad_features
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
@@ -24,6 +26,125 @@ class TestBasicDatasetAPI(TestDatasetAPI):
  """
  Test basic top-level API for dataset objects.
  """

  def test_pad_features(self):
    """Test that pad_features pads features correctly.

    Covers partial batches (wrap-around), exact-size batches, object
    dtype features, and multidimensional features.
    """
    batch_size = 100
    num_features = 10

    # Test case where 2*n_samples < batch_size (multiple wraps needed)
    n_samples = 29
    X_b = np.zeros((n_samples, num_features))

    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case where n_samples < batch_size (single partial wrap)
    n_samples = 79
    X_b = np.zeros((n_samples, num_features))
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case where n_samples == batch_size (no padding needed)
    n_samples = 100
    X_b = np.zeros((n_samples, num_features))
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case for object featurization.
    n_samples = 2
    X_b = np.array([{"a": 1}, {"b": 2}])
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case for more complicated object featurization
    n_samples = 2
    X_b = np.array([(1, {"a": 1}), (2, {"b": 2})])
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size

    # Test case with multidimensional data
    n_samples = 50
    num_atoms = 15
    d = 3
    X_b = np.zeros((n_samples, num_atoms, d))
    X_out = pad_features(batch_size, X_b)
    assert len(X_out) == batch_size
  

  def test_pad_batches(self):
    """Test that pad_batch pads batches correctly.

    Each case builds matching (y, w, ids) arrays for a feature array,
    pads to batch_size, and checks all four outputs are full-size.
    """
    batch_size = 100
    num_features = 10
    num_tasks = 5

    def _check_padded(X_b, n_samples):
      # Build companion arrays, pad, and verify every output length.
      y_b = np.zeros((n_samples, num_tasks))
      w_b = np.zeros((n_samples, num_tasks))
      ids_b = np.zeros((n_samples,))
      X_out, y_out, w_out, ids_out = pad_batch(
          batch_size, X_b, y_b, w_b, ids_b)
      assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

    # Case where 2*n_samples < batch_size (multiple wraps needed)
    _check_padded(np.zeros((29, num_features)), 29)

    # Case where n_samples < batch_size (single partial wrap)
    _check_padded(np.zeros((79, num_features)), 79)

    # Case where n_samples == batch_size (no padding needed)
    _check_padded(np.zeros((100, num_features)), 100)

    # Case for object featurization.
    _check_padded(np.array([{"a": 1}, {"b": 2}]), 2)

    # Case for more complicated object featurization
    _check_padded(np.array([(1, {"a": 1}), (2, {"b": 2})]), 2)

    # Case with multidimensional data
    _check_padded(np.zeros((50, 15, 3)), 50)
    
  def test_get_task_names(self):
    """Test that get_task_names returns correct task_names"""
    solubility_dataset = self.load_solubility_data()
@@ -108,14 +229,6 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
    print("type(X_shape), type(y_shape), type(w_shape), type(ids_shape)")
    print(type(X_shape), type(y_shape), type(w_shape), type(ids_shape))
    print("type(X.shape), type(y.shape), type(w.shape), type(ids.shape)")
    print(type(X.shape), type(y.shape), type(w.shape), type(ids.shape))
    print("X_shape, y_shape, w_shape, ids_shape")
    print(X_shape, y_shape, w_shape, ids_shape)
    print("X.shape, y.shape, w.shape, ids.shape")
    print(X.shape, y.shape, w.shape, ids.shape)
    assert X_shape == X.shape
    assert y_shape == y.shape
    assert w_shape == w.shape
+16 −4
Original line number Diff line number Diff line
@@ -169,9 +169,13 @@ class NeighborListAtomicCoordinates(Featurizer):
    Threshold distance [Angstroms] for counting neighbors.
  """ 

  def __init__(self, neighbor_cutoff=4):
  def __init__(self, max_num_neighbors=None, neighbor_cutoff=4):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    if max_num_neighbors is not None:
      if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
        raise ValueError("max_num_neighbors must be positive integer.")
    self.max_num_neighbors = max_num_neighbors
    self.neighbor_cutoff = neighbor_cutoff
    # Type of data created by this featurizer
    self.dtype = object
@@ -219,9 +223,17 @@ class NeighborListAtomicCoordinates(Featurizer):
            continue
          # TODO(rbharath): How does distance need to be modified here to
          # account for periodic boundary conditions?
          if np.linalg.norm(coords[atom] - coords[neighbor_atom]) < self.neighbor_cutoff:
            neighbor_list[atom].add(neighbor_atom)
          dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
          if dist < self.neighbor_cutoff:
            neighbor_list[atom].add((neighbor_atom, dist))
          
      # Sort neighbors by distance
      closest_neighbors = sorted(
          list(neighbor_list[atom]), key=lambda elt: elt[1])
      closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
      # Pick up to max_num_neighbors
      closest_neighbors = closest_neighbors[:self.max_num_neighbors]
      neighbor_list[atom] = closest_neighbors

      neighbor_list[atom] = sorted(list(neighbor_list[atom]))
        
    return (bohr_coords, neighbor_list)
+34 −0
Original line number Diff line number Diff line
@@ -194,3 +194,37 @@ class TestAtomicCoordinates(unittest.TestCase):
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == N-1

  def test_neighbor_list_max_num_neighbors(self):
    """
    Test that neighbor lists return only max_num_neighbors.
    """
    N = self.mol.GetNumAtoms()

    max_num_neighbors = 1
    featurizer = NeighborListAtomicCoordinates(max_num_neighbors)
    nblist = featurizer._featurize(self.mol)[1]

    # No atom may exceed the requested neighbor cap.
    for atom in range(N):
      assert len(nblist[atom]) <= max_num_neighbors

    # Recompute pairwise distances by hand and check that the single
    # retained neighbor is the nearest one (since max_num_neighbors = 1).
    coords = get_coords(self.mol)
    for i in range(N):
      closest_dist, closest_nbr = np.inf, None
      for j in range(N):
        if j == i:
          continue
        dist = np.linalg.norm(coords[i] - coords[j])
        print("Distance(%d, %d) = %f" % (i, j, dist))
        if dist < closest_dist:
          closest_dist, closest_nbr = dist, j
      print("Closest neighbor to %d is %d" % (i, closest_nbr))
      print("Distance: %f" % closest_dist)
      # Neighbors beyond the cutoff are dropped entirely.
      if closest_dist < featurizer.neighbor_cutoff:
        assert nblist[i] == [closest_nbr]
      else:
        assert nblist[i] == []
+0 −1
Original line number Diff line number Diff line
@@ -81,7 +81,6 @@ class HyperparamOpt(object):
        model = self.model_class(
            self.tasks, self.task_types, model_params, model_dir,
            verbosity=self.verbosity)
        
      model.fit(train_dataset)
      model.save()
    
Loading