Commit f4c493d8 authored by Peter Eastman

Continuing refactoring of Dataset

parent 098095d4
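Taken together, the hunks below remove Dataset.to_numpy() in favor of one property per array (X, y, w, ids), rename the on-disk implementation from Dataset to DiskDataset, and move purely in-memory call sites onto NumpyDataset. A minimal sketch of the migrated calling convention (shapes are arbitrary; all four arrays are passed explicitly rather than relying on constructor defaults not shown in this diff):

    import numpy as np
    from deepchem.datasets import NumpyDataset

    X = np.random.rand(4, 8)                # 4 samples, 8 features
    y = np.random.randint(2, size=(4, 2))   # 2 binary tasks
    w = np.ones((4, 2))
    ids = np.arange(4)
    dataset = NumpyDataset(X, y, w, ids)

    # Old style, removed by this commit:
    #   X, y, w, ids = dataset.to_numpy()
    # New style, one property per array:
    X2, y2, w2, ids2 = dataset.X, dataset.y, dataset.w, dataset.ids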
+26 −42
@@ -131,6 +131,11 @@ class Dataset(object):
    """Get the names of the tasks associated with this dataset."""
    raise NotImplementedError()

+  @property
+  def X(self):
+    """Get the X vector for this dataset as a single numpy array."""
+    raise NotImplementedError()
+
  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
@@ -147,15 +152,6 @@ class Dataset(object):
    """Get the weight vector for this dataset as a single numpy array."""
    raise NotImplementedError()

-  def to_numpy(self):
-    """
-    Transforms internal data into arrays X, y, w, ids
-
-    Creates three arrays containing all data in this object. This operation is
-    dangerous (!) for large datasets which don't fit into memory.
-    """
-    raise NotImplementedError()
-
  def iterbatches(self, batch_size=None, epoch=0, deterministic=False, pad_batches=False):
    """Generator that iterates over minibatches from the dataset.
    
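With to_numpy() gone from the abstract base, consumers go through the four properties instead, which keeps them agnostic to the backing store. A hypothetical helper illustrating the duck-typed usage (summarize and its weighting scheme are illustrative, not part of this commit):

    import numpy as np

    def summarize(dataset):
      """Per-task weighted label means for any Dataset exposing y and w."""
      y, w = dataset.y, dataset.w
      # Clip tiny weight sums to avoid division by zero (illustrative,
      # not a rigorous weighting scheme).
      totals = (y * w).sum(axis=0)
      counts = np.maximum(w.sum(axis=0), 1)
      return totals / counts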
@@ -167,7 +163,7 @@ class Dataset(object):
class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""

-  def __init__(self, X, y, w=None, ids=None):
+  def __init__(self, X, y, w=None, ids=None, verbosity=None):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
@@ -183,6 +179,7 @@ class NumpyDataset(Dataset):
    self._y = y
    self._w = w
    self._ids = np.array(ids, dtype=object)
+    self.verbosity = verbosity

  def __len__(self):
    """
@@ -201,6 +198,11 @@ class NumpyDataset(Dataset):
    """Get the names of the tasks associated with this dataset."""
    return np.arange(self._y.shape[1])

+  @property
+  def X(self):
+    """Get the X vector for this dataset as a single numpy array."""
+    return self._X
+
  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
@@ -216,15 +218,6 @@ class NumpyDataset(Dataset):
    """Get the weight vector for this dataset as a single numpy array."""
    return self._w

-  def to_numpy(self):
-    """
-    Transforms internal data into arrays X, y, w, ids
-
-    Creates three arrays containing all data in this object. This operation is
-    dangerous (!) for large datasets which don't fit into memory.
-    """
-    return self._X, self._y, self._w, self._ids
-
  def iterbatches(self, batch_size=None, epoch=0, deterministic=False, pad_batches=False):
    """Generator that iterates over minibatches from the dataset.
    
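NumpyDataset now accepts the same verbosity keyword as DiskDataset, so the two can be swapped at a call site without touching other arguments. A small sketch of the in-memory path, assuming verbosity takes the "low"/"high" strings used elsewhere in deepchem and that iterbatches yields (X_b, y_b, w_b, ids_b) tuples like its DiskDataset counterpart:

    import numpy as np
    from deepchem.datasets import NumpyDataset

    X = np.arange(12).reshape(6, 2)
    y = np.arange(6).reshape(6, 1)
    dataset = NumpyDataset(X, y, np.ones((6, 1)), np.arange(6), verbosity="high")

    # deterministic=True fixes the sample order across epochs.
    for X_b, y_b, w_b, ids_b in dataset.iterbatches(batch_size=2, deterministic=True):
      assert X_b.shape == (2, 2)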
@@ -524,7 +517,7 @@ class DiskDataset(Dataset):
        reshard_dir, new_basename, tasks, X_next, y_next, w_next, ids_next))
    ind += 1
    # Get new metadata rows
-    resharded_dataset = Dataset(
+    resharded_dataset = DiskDataset(
        data_dir=reshard_dir, tasks=tasks, metadata_rows=new_metadata,
        verbosity=self.verbosity)
    shutil.rmtree(self.data_dir)
@@ -561,12 +554,12 @@ class DiskDataset(Dataset):
    Xs, ys, ws, all_ids = [], [], [], []
    metadata_rows = []
    for ind, dataset in enumerate(datasets):
-      X, y, w, ids = dataset.to_numpy()
+      X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
      basename = "dataset-%d" % ind
      tasks = dataset.get_task_names()
      metadata_rows.append(
          DiskDataset.write_data_to_disk(merge_dir, basename, tasks, X, y, w, ids))
-    return Dataset(data_dir=merge_dir,
+    return DiskDataset(data_dir=merge_dir,
                   metadata_rows=metadata_rows,
                   verbosity=dataset.verbosity)

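With Dataset now abstract, the factory methods must construct the concrete DiskDataset; this hunk, together with reshard above and subset/select below, completes the rename. One wrinkle worth noting: verbosity=dataset.verbosity reads the for-loop variable after the loop has finished, i.e. the last merged dataset's verbosity; if that is unintended, datasets[0].verbosity would be the conventional choice. A usage sketch, assuming merge is a static method taking (merge_dir, datasets) as the body above suggests:

    import tempfile
    import numpy as np
    from deepchem.datasets import DiskDataset

    def random_disk_dataset(n_samples):
      # Hypothetical helper: writes a small random dataset to a scratch dir.
      X = np.random.rand(n_samples, 2)
      y = np.random.rand(n_samples, 1)
      return DiskDataset.from_numpy(tempfile.mkdtemp(), X, y,
                                    np.ones((n_samples, 1)), np.arange(n_samples))

    merged = DiskDataset.merge(tempfile.mkdtemp(),
                               [random_disk_dataset(3), random_disk_dataset(5)])
    assert len(merged) == 8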
@@ -583,7 +576,7 @@ class DiskDataset(Dataset):
      basename = "dataset-%d" % shard_num
      metadata_rows.append(DiskDataset.write_data_to_disk(
          subset_dir, basename, tasks, X, y, w, ids))
-    return Dataset(data_dir=subset_dir,
+    return DiskDataset(data_dir=subset_dir,
                   metadata_rows=metadata_rows,
                   verbosity=self.verbosity)

@@ -761,7 +754,7 @@ class DiskDataset(Dataset):
      os.makedirs(select_dir)
    # Handle edge case with empty indices
    if not len(indices):
-      return Dataset(
+      return DiskDataset(
          data_dir=select_dir, metadata_rows=[], verbosity=self.verbosity)
    indices = np.array(sorted(indices)).astype(int)
    count, indices_count = 0, 0
@@ -790,7 +783,7 @@ class DiskDataset(Dataset):
      # Updating counts
      indices_count += num_shard_elts
      count += shard_len
-    return Dataset(data_dir=select_dir,
+    return DiskDataset(data_dir=select_dir,
                   metadata_rows=metadata_rows,
                   verbosity=self.verbosity)

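The empty-indices early return above means select can be called unconditionally. A sketch of that edge case, using the select(select_dir, indices) calling convention shown in the test file below:

    import tempfile
    import numpy as np
    from deepchem.datasets import DiskDataset

    X = np.random.rand(6, 2)
    y = np.random.rand(6, 1)
    dataset = DiskDataset.from_numpy(tempfile.mkdtemp(), X, y,
                                     np.ones((6, 1)), np.arange(6))
    empty = dataset.select(tempfile.mkdtemp(), [])
    assert len(empty) == 0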
@@ -828,23 +821,6 @@ class DiskDataset(Dataset):
        for (task_num, task) in enumerate(tasks)]
    return task_datasets

-  def to_numpy(self):
-    """
-    Transforms internal data into arrays X, y, w, ids
-
-    Creates three arrays containing all data in this object. This operation is
-    dangerous (!) for large datasets which don't fit into memory.
-    """
-    Xs, ys, ws, ids = [], [], [], []
-    for (X_b, y_b, w_b, ids_b) in self.itershards():
-      Xs.append(X_b)
-      ys.append(y_b)
-      ws.append(w_b)
-      ids.append(np.atleast_1d(np.squeeze(ids_b)))
-    np.concatenate(ids)
-    return (np.vstack(Xs), np.vstack(ys), np.vstack(ws),
-            np.concatenate(ids))
-
  @property
  def ids(self):
    """Get the ids vector for this dataset as a single numpy array."""
@@ -855,6 +831,14 @@ class DiskDataset(Dataset):
      ids.append(np.atleast_1d(np.squeeze(ids_b)))
    return np.concatenate(ids)

+  @property
+  def X(self):
+    """Get the X vector for this dataset as a single numpy array."""
+    Xs = []
+    for (X_b, _, _, _) in self.itershards():
+      Xs.append(X_b)
+    return np.vstack(Xs)
+
  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
+5 −5
@@ -9,7 +9,7 @@ import os
import numpy as np
import shutil
from deepchem.utils.save import load_from_disk
-from deepchem.datasets import Dataset
+from deepchem.datasets import DiskDataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
@@ -59,7 +59,7 @@ def load_muv(base_dir, reload=True, frac_train=.8):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
-    dataset = Dataset(data_dir, reload=True)
+    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
@@ -69,7 +69,7 @@ def load_muv(base_dir, reload=True, frac_train=.8):
    for transformer in transformers:
        transformer.transform(dataset)

-  X, y, w, ids = dataset.to_numpy()
+  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  num_train = frac_train * len(dataset)
  MUV_tasks = MUV_tasks[:num_tasks]
@@ -80,9 +80,9 @@ def load_muv(base_dir, reload=True, frac_train=.8):
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

-  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
+  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, MUV_tasks)
-  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
+  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, MUV_tasks)
  
  return MUV_tasks, (train_dataset, valid_dataset), transformers
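One pre-existing wrinkle these call sites inherit rather than introduce: num_train = frac_train * len(dataset) is a float, and float slice bounds such as y[:num_train, :num_tasks] raise a TypeError on modern numpy, which requires integer indices. If this path misbehaves, the likely one-line fix (not part of this commit) is:

    num_train = int(frac_train * len(dataset))

The same pattern appears in load_pcba below.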
+5 −5
@@ -9,7 +9,7 @@ import os
import numpy as np
import shutil
from deepchem.utils.save import load_from_disk
-from deepchem.datasets import Dataset
+from deepchem.datasets import DiskDataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
@@ -80,7 +80,7 @@ def load_pcba(base_dir, reload=True, frac_train=.8):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
-    dataset = Dataset(data_dir, reload=True)
+    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
@@ -93,7 +93,7 @@ def load_pcba(base_dir, reload=True, frac_train=.8):

  print("About to perform train/valid/test split.")
  num_train = frac_train * len(dataset)
-  X, y, w, ids = dataset.to_numpy()
+  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 120
  PCBA_tasks = PCBA_tasks[:num_tasks]
  print("Using following tasks")
@@ -103,9 +103,9 @@ def load_pcba(base_dir, reload=True, frac_train=.8):
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

-  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
+  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, PCBA_tasks)
-  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
+  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, PCBA_tasks)

  
+3 −3
@@ -12,7 +12,7 @@ import pandas as pd
import shutil
from rdkit import Chem
from deepchem.utils.save import load_from_disk
-from deepchem.datasets import Dataset
+from deepchem.datasets import DiskDataset
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
@@ -109,7 +109,7 @@ def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  X = np.array(features, dtype=object)
  w = np.ones_like(y)
   
-  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
+  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers
@@ -176,7 +176,7 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  X = np.vstack(features)
  w = np.ones_like(y)
   
-  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
+  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers
+11 −11
@@ -18,7 +18,7 @@ from deepchem.datasets import sparsify_features
from deepchem.datasets import densify_features
from deepchem.datasets import pad_batch
from deepchem.datasets import pad_features
-from deepchem.datasets import Dataset
+from deepchem.datasets import DiskDataset, NumpyDataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import NormalizationTransformer
@@ -198,15 +198,15 @@ class TestBasicDatasetAPI(TestDatasetAPI):
  def test_reshard(self):
    """Test that resharding the dataset works."""
    solubility_dataset = self.load_solubility_data()
-    X, y, w, ids = solubility_dataset.to_numpy()
+    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
    assert solubility_dataset.get_number_shards() == 1
    solubility_dataset.reshard(shard_size=1)
    assert solubility_dataset.get_shard_size() == 1
-    X_r, y_r, w_r, ids_r = solubility_dataset.to_numpy()
+    X_r, y_r, w_r, ids_r = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
    assert solubility_dataset.get_number_shards() == 10
    solubility_dataset.reshard(shard_size=10)
    assert solubility_dataset.get_shard_size() == 10
-    X_rr, y_rr, w_rr, ids_rr = solubility_dataset.to_numpy()
+    X_rr, y_rr, w_rr, ids_rr = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)

    # Test first resharding worked
    np.testing.assert_array_equal(X, X_r)
@@ -229,12 +229,12 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.ones((num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
-    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)
+    dataset = DiskDataset.from_numpy(self.data_dir, X, y, w, ids)

    select_dir = tempfile.mkdtemp()
    indices = [0, 4, 5, 8]
    select_dataset = dataset.select(select_dir, indices)
-    X_sel, y_sel, w_sel, ids_sel = select_dataset.to_numpy()
+    X_sel, y_sel, w_sel, ids_sel = (select_dataset.X, select_dataset.y, select_dataset.w, select_dataset.ids)
    np.testing.assert_array_equal(X[indices], X_sel)
    np.testing.assert_array_equal(y[indices], y_sel)
    np.testing.assert_array_equal(w[indices], w_sel)
@@ -252,7 +252,7 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
    
-    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")
+    dataset = NumpyDataset(X, y, w, ids)

    X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
    assert X_shape == X.shape
@@ -272,7 +272,7 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)
    
-    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")
+    dataset = NumpyDataset(X, y, w, ids)

    task_dirs = []
    try:
@@ -281,7 +281,7 @@ class TestBasicDatasetAPI(TestDatasetAPI):
      singletask_datasets = dataset.to_singletask(task_dirs)
      for task in range(num_tasks):
        singletask_dataset = singletask_datasets[task]
-        X_task, y_task, w_task, ids_task = singletask_dataset.to_numpy()
+        X_task, y_task, w_task, ids_task = (singletask_dataset.X, singletask_dataset.y, singletask_dataset.w, singletask_dataset.ids)
        w_nonzero = w[:, task] != 0
        np.testing.assert_array_equal(X_task, X[w_nonzero != 0])
        np.testing.assert_array_equal(y_task.flatten(), y[:, task][w_nonzero != 0])
@@ -309,7 +309,7 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    solubility_dataset = self.load_solubility_data()
    data_shape = solubility_dataset.get_data_shape()
    tasks = solubility_dataset.get_task_names()
-    X, y, w, ids = solubility_dataset.to_numpy()
+    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
    N_samples = len(solubility_dataset)
    N_tasks = len(tasks)
    
@@ -330,7 +330,7 @@ class TestBasicDatasetAPI(TestDatasetAPI):
  def test_get_statistics(self):
    """Test statistics computation of this dataset."""
    solubility_dataset = self.load_solubility_data()
-    X, y, _, _ = solubility_dataset.to_numpy()
+    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
    X_means, y_means = np.mean(X, axis=0), np.mean(y, axis=0)
    X_stds, y_stds = np.std(X, axis=0), np.std(y, axis=0)
    comp_X_means, comp_X_stds, comp_y_means, comp_y_stds = \
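Since NumpyDataset stores the arrays it is given, a natural companion test (not in this commit; sketched in the style of the surrounding suite) is a straight roundtrip through the new properties:

    import numpy as np
    from deepchem.datasets import NumpyDataset

    def test_property_roundtrip():
      """Arrays fed to NumpyDataset come back unchanged via X/y/w/ids."""
      X = np.random.rand(5, 3)
      y = np.random.rand(5, 1)
      w = np.ones((5, 1))
      ids = np.arange(5)
      dataset = NumpyDataset(X, y, w, ids)
      np.testing.assert_array_equal(X, dataset.X)
      # y may be reshaped internally, so compare flattened.
      np.testing.assert_array_equal(y.flatten(), dataset.y.flatten())
      np.testing.assert_array_equal(w, dataset.w)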