Commit 88dd25b5 authored by nd-02110114's avatar nd-02110114
Browse files

Merge branch 'master' into fix-base-classes-feat

parents 3dcbd15d 588f8a77
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ install:
  - hash -r
  - conda config --set always_yes yes --set changeps1 no
  - conda update -q conda
  - bash scripts/install_deepchem_conda.sh deepchem
  - bash scripts/install_deepchem_conda.sh cpu
  - conda activate deepchem
  - python setup.py install
script:
+91 −38
Original line number Diff line number Diff line
@@ -1352,7 +1352,7 @@ class DiskDataset(Dataset):
      w_next = np.zeros((0,) + w_shape[1:])
      ids_next = np.zeros((0,), dtype=object)
      for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
        logger.info("Resharding shard %d/%d" % (shard_num, n_shards))
        logger.info("Resharding shard %d/%d" % (shard_num + 1, n_shards))
        # Handle shapes
        X = np.reshape(X, (len(X),) + self.get_data_shape())
        # Note that this means that DiskDataset resharding currently doesn't
@@ -1816,8 +1816,18 @@ class DiskDataset(Dataset):
  def sparse_shuffle(self) -> None:
    """Shuffling that exploits data sparsity to shuffle large datasets.
    
    Only for 1-dimensional feature vectors (does not work for tensorial
    featurizations).
    If feature vectors are sparse, say circular fingerprints or any other
    representation that contains few nonzero values, it can be possible to
    exploit the sparsity of the vector to simplify shuffles. This method
    implements a sparse shuffle by compressing sparse feature vectors down
    into a compressed representation, then shuffles this compressed dataset in
    memory and writes the results to disk.

    Note
    ----
    This method only works for 1-dimensional feature vectors (does not work
    for tensorial featurizations). Note that this shuffle is performed in
    place.
    """
    time1 = time.time()
    shard_size = self.get_shard_size()
@@ -1855,52 +1865,84 @@ class DiskDataset(Dataset):
    logger.info("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1))

  def complete_shuffle(self, data_dir: Optional[str] = None) -> "DiskDataset":
    """
    Completely shuffle across all data, across all shards.
    """Completely shuffle across all data, across all shards.

    Note: this loads all the data into ram, and can be prohibitively
    expensive for larger datasets.
    Note
    ----
    The algorithm used for this complete shuffle is O(N^2) where N is the
    number of shards. It simply constructs each shard of the output dataset
    one at a time. Since the complete shuffle can take a long time, it's
    useful to watch the logging output. Each shuffled shard is constructed
    using select() which logs as it selects from each original shard. This
    will result in O(N^2) logging statements, one for each extraction of
    shuffled shard i's contributions from original shard j.

    Parameters
    ----------
    shard_size: int
      size of the resulting dataset's size. If None, then the first
      shard's shard_size will be used.
    data_dir: Optional[str], (default None)
      Directory to write the shuffled dataset to. If none is specified a
      temporary directory will be used.

    Returns
    -------
    DiskDataset
      A DiskDataset with a single shard.

      A DiskDataset whose data is a randomly shuffled version of this dataset. 
    """
    # Create temp directory to store shuffled version
    shuffle_dir = tempfile.mkdtemp()
    n_shards = self.get_number_shards()
    N = len(self)
    perm = np.random.permutation(N)
    shard_size = self.get_shard_size()

    def generator():
      """Yield the shuffled dataset one output shard at a time.

      Walks `perm` in consecutive windows of `shard_size` indices (the last
      window may be shorter) and yields one `(X, y, w, ids)` tuple per
      window, with the rows arranged in the order given by `perm`.
      """
      start = 0
      shard_num = 0
      while start < N:
        logger.info("Constructing shard %d" % shard_num)
        # The final shard may hold fewer than shard_size elements.
        end = min(start + shard_size, N)
        shard_indices = perm[start:end]
        # One bit of trickiness here is that select() returns elements in
        # sorted index order, which doesn't respect the random permutation.
        # For example, suppose we'd like these elements in our permuted
        # shard:
        #
        # [12, 234, 1, 4]
        #
        # Then select would return elements in order
        #
        # [1, 4, 12, 234]
        shard_dataset = self.select(shard_indices)
        # We need to recover the original ordering, i.e. find the location of
        # each requested index within the sorted indices. Since this is a
        # slice of a permutation every index is unique, so a binary search
        # over the sorted indices gives the exact match location for each
        # one in O(k log k) instead of one linear np.where scan per element.
        sorted_indices = np.sort(shard_indices)
        reverted_indices = np.searchsorted(sorted_indices, shard_indices)
        # Let's pull out shard elements
        shard_X, shard_y, shard_w, shard_ids = (shard_dataset.X,
                                                shard_dataset.y,
                                                shard_dataset.w,
                                                shard_dataset.ids)

        yield (shard_X[reverted_indices], shard_y[reverted_indices],
               shard_w[reverted_indices], shard_ids[reverted_indices])

        start = end
        shard_num += 1

    all_X = []
    all_y = []
    all_w = []
    all_ids = []
    for Xs, ys, ws, ids in self.itershards():
      all_X.append(Xs)
      if ys is not None:
        all_y.append(ys)
      if ws is not None:
        all_w.append(ws)
      all_ids.append(ids)

    Xs = np.concatenate(all_X)
    ys = np.concatenate(all_y)
    ws = np.concatenate(all_w)
    ids = np.concatenate(all_ids)

    perm = np.random.permutation(Xs.shape[0])
    Xs = Xs[perm]
    ys = ys[perm]
    ws = ws[perm]
    ids = ids[perm]

    return DiskDataset.from_numpy(Xs, ys, ws, ids, data_dir=data_dir)
    return DiskDataset.create_dataset(
        generator(), data_dir=data_dir, tasks=self.get_task_names())

  def shuffle_each_shard(self,
                         shard_basenames: Optional[List[str]] = None) -> None:
@@ -2057,16 +2099,27 @@ class DiskDataset(Dataset):
    self._cached_shards = None

  def select(self, indices: Sequence[int],
             select_dir: str = None) -> "DiskDataset":
             select_dir: Optional[str] = None) -> "DiskDataset":
    """Creates a new dataset from a selection of indices from self.

    Note
    ----
    The specified indices will be returned in sorted order. That is, if you
    request that indices `[3, 1, 2]` are returned, you will get a
    `DiskDataset` which contains elements in order `[1, 2, 3]`.

    Parameters
    ----------
    indices: list
      List of indices to select.
    select_dir: string
    select_dir: Optional[str], (default None)
      Path to new directory that the selected indices will be copied
      to.

    Returns
    -------
    DiskDataset
      Contains selected indices.
    """
    if select_dir is not None:
      if not os.path.exists(select_dir):
+45 −8
Original line number Diff line number Diff line
"""
Testing singletask/multitask dataset shuffling 
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import shutil
import tempfile
@@ -13,6 +9,47 @@ import deepchem as dc
import numpy as np


def _check_complete_shuffle(n_samples, shard_size=None):
  """Build a random DiskDataset, completely shuffle it and check the result.

  Parameters
  ----------
  n_samples: int
    Number of rows in the random feature matrix (10 features per row).
  shard_size: Optional[int], (default None)
    If given, the dataset is resharded to this shard size before shuffling.
    If None, the dataset is shuffled as created by `from_numpy`.
  """
  X = np.random.rand(n_samples, 10)
  dataset = dc.data.DiskDataset.from_numpy(X)
  if shard_size is not None:
    dataset.reshard(shard_size=shard_size)
  shuffled = dataset.complete_shuffle()
  # Same number of elements, and the same set of ids in a different order.
  assert len(shuffled) == len(dataset)
  assert not np.array_equal(shuffled.ids, dataset.ids)
  assert sorted(shuffled.ids) == sorted(dataset.ids)
  # Shuffling must not change the shapes of the stored arrays.
  assert shuffled.X.shape == dataset.X.shape
  assert shuffled.y.shape == dataset.y.shape
  assert shuffled.w.shape == dataset.w.shape


def test_complete_shuffle_one_shard():
  """Test that complete shuffle works with only one shard."""
  _check_complete_shuffle(10)


def test_complete_shuffle_multiple_shard():
  """Test that complete shuffle works with multiple shards."""
  _check_complete_shuffle(100, shard_size=10)


def test_complete_shuffle_multiple_shard_uneven():
  """Test that complete shuffle works with multiple shards and some shards not full size."""
  _check_complete_shuffle(57, shard_size=10)


def test_complete_shuffle():
  """Test complete shuffle."""
  current_dir = os.path.dirname(os.path.realpath(__file__))
@@ -22,8 +59,8 @@ def test_complete_shuffle():
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=2)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)
  dataset = loader.create_dataset(dataset_file, shard_size=2)

  X_orig, y_orig, w_orig, orig_ids = (dataset.X, dataset.y, dataset.w,
                                      dataset.ids)
@@ -52,8 +89,8 @@ def test_sparse_shuffle():
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=2)
      tasks=tasks, feature_field="smiles", featurizer=featurizer)
  dataset = loader.create_dataset(dataset_file, shard_size=2)

  X_orig, y_orig, w_orig, orig_ids = (dataset.X, dataset.y, dataset.w,
                                      dataset.ids)
+1 −1
Original line number Diff line number Diff line
@@ -119,7 +119,7 @@ class GraphData:
    Returns
    -------
    dgl.DGLGraph
      Graph data for PyTorch Geometric
      Graph data for DGL

    Notes
    -----
+5 −1
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@ import collections
import logging
from functools import reduce
from operator import mul
from typing import Dict, List, Optional
from typing import cast, Dict, List, Optional

from deepchem.data import Dataset
from deepchem.trans import Transformer
@@ -155,6 +155,8 @@ class GridHyperparamOpt(HyperparamOpt):

      evaluator = Evaluator(model, valid_dataset, output_transformers)
      multitask_scores = evaluator.compute_model_performance([metric])
      # NOTE: this cast is only a workaround for the type checker; it has no
      # effect at runtime.
      multitask_scores = cast(Dict[str, float], multitask_scores)
      valid_score = multitask_scores[metric.name]
      hp_str = _convert_hyperparam_dict_to_filename(hyper_params)
      all_scores[hp_str] = valid_score
@@ -180,6 +182,8 @@ class GridHyperparamOpt(HyperparamOpt):
      return best_model, best_hyperparams, all_scores
    train_evaluator = Evaluator(best_model, train_dataset, output_transformers)
    multitask_scores = train_evaluator.compute_model_performance([metric])
    # NOTE: this cast is only a workaround for the type checker; it has no
    # effect at runtime.
    multitask_scores = cast(Dict[str, float], multitask_scores)
    train_score = multitask_scores[metric.name]
    logger.info("Best hyperparameters: %s" % str(best_hyperparams))
    logger.info("train_score: %f" % train_score)
Loading