Commit 67095305 authored by Bharath Ramsundar

Dataset changes

parent a8d420da
+330 −95
@@ -8,24 +8,34 @@ import deepchem as dc
import numpy as np
import pandas as pd
import random
from deepchem.utils.save import save_to_disk, save_metadata
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import log
import logging
from pandas import read_hdf
import tempfile
import time
import shutil
import json
from multiprocessing.dummy import Pool
import warnings
from multiprocessing.dummy import Pool
from deepchem.utils.save import save_to_disk, save_metadata
from deepchem.utils.save import load_from_disk

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"
logger = logging.getLogger(__name__)


def sparsify_features(X):
  """Extracts a sparse feature representation from dense feature array."""
  """Extracts a sparse feature representation from dense feature array.

  Parameters
  ----------
  X: np.ndarray
    Of shape `(n_samples, ...)`
  
  Returns
  -------
  X_sparse, a np.ndarray with `dtype=object` where `X_sparse[i]` is a
  tuple of `(nonzero_inds, nonzero_vals)` with nonzero indices and
  values in the i-th sample of `X`.
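
  For illustration, a small sketch of pulling out the sparse
  representation (array values here are hypothetical):

  >>> X = np.array([[0.0, 1.0], [2.0, 0.0]])
  >>> X_sparse = sparsify_features(X)
  >>> nonzero_inds, nonzero_vals = X_sparse[0]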
  """
  n_samples = len(X)
  X_sparse = []
  for i in range(n_samples):
@@ -37,7 +47,24 @@ def sparsify_features(X):


def densify_features(X_sparse, num_features):
  """Expands sparse feature representation to dense feature array."""
  """Expands sparse feature representation to dense feature array.

  Assumes that the sparse representation was constructed from an array
  of original shape `(n_samples, num_features)`, so it does not
  support reconstructing multidimensional dense arrays.

  Parameters
  ----------
  X_sparse: np.ndarray
    Must have `dtype=object`. `X_sparse[i]` must be a tuple of nonzero
    indices and values.
  num_features: int
    Number of features in dense array.

  Returns
  -------
  X, a np.ndarray of shape `(n_samples, num_features)`.
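
  A hedged round-trip sketch (assumes the sparse form was produced by
  `sparsify_features` above):

  >>> X = np.array([[0.0, 1.0], [2.0, 0.0]])
  >>> X_dense = densify_features(sparsify_features(X), num_features=2)
  >>> assert (X == X_dense).all()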
  """
  n_samples = len(X_sparse)
  X = np.zeros((n_samples, num_features))
  for i in range(n_samples):
@@ -49,10 +76,38 @@ def densify_features(X_sparse, num_features):
def pad_features(batch_size, X_b):
  """Pads a batch of features to have precisely batch_size elements.

  Version of pad_batch for use at prediction time.
  Given an array of features with length less than or equal to
  `batch_size`, pads it to `batch_size` length. It does this by
  repeating the original features in tiled fashion. For illustration,
  suppose that `len(X_b) == 3` and `batch_size == 10`.

  >>> X_b = np.arange(3)
  >>> X_b
  array([0, 1, 2])
  >>> batch_size = 10
  >>> X_manual = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0])
  >>> X_out = pad_features(batch_size, X_b)
  >>> assert (X_manual == X_out).all() 

  This function is similar to `pad_batch` but doesn't handle labels
  `y` or weights `w` and is intended to be used for inference-time
  query processing.

  Parameters
  ----------
  batch_size: int
    The number of datapoints in a batch
  X_b: np.ndarray
    Must be such that `len(X_b) <= batch_size`

  Returns
  -------
  X_out, a np.ndarray with `len(X_out) == batch_size`.
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
  if num_samples > batch_size:
    raise ValueError("Cannot pad an array longer than `batch_size`")
  elif num_samples == batch_size:
    return X_b
  else:
    # By invariant of when this is called, can assume num_samples > 0
@@ -79,7 +134,28 @@ def pad_features(batch_size, X_b):
def pad_batch(batch_size, X_b, y_b, w_b, ids_b):
  """Pads batch to have size precisely batch_size elements.

  Fills in batch by wrapping around samples till whole batch is filled.
  Given arrays of features `X_b`, labels `y_b`, weights `w_b`, and
  identifiers `ids_b`, all with length less than or equal to
  `batch_size`, pads them to `batch_size` length. It does this by
  repeating the original entries in tiled fashion. Note that `X_b,
  y_b, w_b, ids_b` must all have the same length.

  Parameters
  ----------
  batch_size: int
    The number of datapoints in a batch
  X_b: np.ndarray
    Must be such that `len(X_b) <= batch_size`
  y_b: np.ndarray
    Must be such that `len(y_b) <= batch_size`
  w_b: np.ndarray
    Must be such that `len(w_b) <= batch_size`
  ids_b: np.ndarray
    Must be such that `len(ids_b) <= batch_size`

  Returns
  -------
  (X_out, y_out, w_out, ids_out), all np.ndarray with length `batch_size`.
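
  For illustration, a minimal sketch with toy one-dimensional arrays:

  >>> X_b, y_b = np.arange(3), np.arange(3)
  >>> w_b, ids_b = np.ones(3), np.arange(3)
  >>> X_out, y_out, w_out, ids_out = pad_batch(5, X_b, y_b, w_b, ids_b)
  >>> assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == 5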
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
@@ -132,7 +208,22 @@ def pad_batch(batch_size, X_b, y_b, w_b, ids_b):


class Dataset(object):
  """Abstract base class for datasets defined by X, y, w elements."""
  """Abstract base class for datasets defined by X, y, w elements.

  `Dataset` objects are used to store representations of a dataset as
  used in a machine learning task. Datasets contain features `X`,
  labels `y`, weights `w` and identifiers `ids`. Different subclasses
  of `Dataset` may choose to hold `X, y, w, ids` in memory or on disk.

  The `Dataset` class attempts to provide for strong interoperability
  with other machine learning representations for datasets.
  Interconversion methods allow `Dataset` objects to be converted
  to and from pandas dataframes and tensorflow datasets, and to (but
  not yet from) pytorch datasets.

  Note that you can never instantiate a `Dataset` object directly.
  Instead you will need to instantiate one of the concrete subclasses.
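
  For example, a sketch of the dataframe interconversion using the
  `NumpyDataset` subclass defined later in this module:

  >>> import numpy as np
  >>> dataset = NumpyDataset(X=np.random.rand(5, 3), y=np.random.rand(5,))
  >>> df = dataset.to_dataframe()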
  """

  def __init__(self):
    raise NotImplementedError()
@@ -146,7 +237,8 @@ class Dataset(object):
  def get_shape(self):
    """Get the shape of the dataset.

    Returns four tuples, giving the shape of the X, y, w, and ids arrays.
    Returns four tuples, giving the shape of the X, y, w, and ids
    arrays.
    """
    raise NotImplementedError()

@@ -156,43 +248,83 @@ class Dataset(object):

  @property
  def X(self):
    """Get the X vector for this dataset as a single numpy array."""
    """Get the X vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of features `X`.
    """
    raise NotImplementedError()

  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    """Get the y vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of labels `y`.
    """
    raise NotImplementedError()

  @property
  def ids(self):
    """Get the ids vector for this dataset as a single numpy array."""
    """Get the ids vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of identifiers `ids`.
    """

    raise NotImplementedError()

  @property
  def w(self):
    """Get the weight vector for this dataset as a single numpy array."""
    """Get the weight vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of weights `w`.
    """
    raise NotImplementedError()

  def __repr__(self):
    """Convert self to REPL print representation."""
    threshold = dc.utils.get_print_threshold()
    id_str = np.array2string(self.ids, threshold=threshold)
    task_str = np.array2string(
        np.array(self.get_task_names()), threshold=threshold)
    return "<%s X.shape: %s, y.shape: %s, w.shape: %s, ids: %s, task_names: %s>" % (
        self.__class__.__name__, str(self.X.shape), str(self.y.shape),
        str(self.w.shape), id_str, task_str)

  def __str__(self):
    """Convert self to str representation."""
    return self.__repr__()

  def iterbatches(self,
                  batch_size=None,
                  epoch=0,
                  deterministic=False,
                  pad_batches=False):
    """
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: `(X,
    y, w, ids)`.

    Parameters
    ----------

    batch_size: int, optional
      Number of elements in each batch
    epoch: int, optional
      Number of epochs to walk over dataset
    deterministic: bool, optional
      If True, follow deterministic order.
    pad_batches: bool, optional
      If True, pad each batch to `batch_size`.

    Returns
    -------

    """
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    Generator which yields tuples of four numpy arrays `(X, y, w, ids)`
    """
    raise NotImplementedError()

@@ -216,9 +348,9 @@ class Dataset(object):

    >> newx, newy, neww = fn(x, y, w)

    It might be called only once with the whole dataset, or multiple times with
    different subsets of the data.  Each time it is called, it should transform
    the samples and return the transformed data.
    It might be called only once with the whole dataset, or multiple
    times with different subsets of the data.  Each time it is called,
    it should transform the samples and return the transformed data.

    Parameters
    ----------
@@ -232,7 +364,25 @@ class Dataset(object):
    raise NotImplementedError()

  def get_statistics(self, X_stats=True, y_stats=True):
    """Compute and return statistics of this dataset."""
    """Compute and return statistics of this dataset.

    Uses `self.itersamples()` to compute means and standard deviations
    of the dataset. Can compute on large datasets that don't fit in
    memory.

    Parameters
    ----------
    X_stats: bool, optional
      If True, compute feature-level mean and standard deviations.
    y_stats: bool, optional
      If True, compute label-level mean and standard deviations.

    Returns
    -------
    If `X_stats == True`, returns `(X_means, X_stds)`. If `y_stats ==
    True`, returns `(y_means, y_stds)`. If both are true, returns
    `(X_means, X_stds, y_means, y_stds)`.
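
    For example, a sketch of computing feature statistics only (the
    `dataset` name here stands in for any concrete `Dataset`):

    >>> X_means, X_stds = dataset.get_statistics(X_stats=True, y_stats=False)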
    """
    X_means = 0.0
    X_m2 = 0.0
    y_means = 0.0
@@ -270,8 +420,8 @@ class Dataset(object):
                      pad_batches=False):
    """Create a tf.data.Dataset that iterates over the data in this Dataset.

    Each value returned by the Dataset's iterator is a tuple of (X, y, w) for
    one batch.
    Each value returned by the Dataset's iterator is a tuple of (X, y,
    w) for one batch.

    Parameters
    ----------
@@ -280,11 +430,15 @@ class Dataset(object):
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
      if True, the data is produced in order.  If False, a different
      random permutation of the data is used for each epoch.
    pad_batches: bool
      if True, batches are padded as necessary to make the size of each batch
      exactly equal batch_size.
      if True, batches are padded as necessary to make the size of
      each batch exactly equal batch_size.

    Returns
    -------
    A `tf.data.Dataset` that iterates over the same data.
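
    A hedged usage sketch (assumes TensorFlow is installed and
    `dataset` is a concrete `Dataset`):

    >>> tf_dataset = dataset.make_tf_dataset(batch_size=32, epochs=1)
    >>> for X_b, y_b, w_b in tf_dataset:
    ...   pass  # consume one batch of (X, y, w) at a time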
    """
    # Retrieve the first sample so we can determine the dtypes.

@@ -308,21 +462,33 @@ class Dataset(object):
  def make_pytorch_dataset(self, epochs=1, deterministic=False):
    """Create a torch.utils.data.IterableDataset that iterates over the data in this Dataset.

    Each value returned by the Dataset's iterator is a tuple of (X, y, w, id) for
    one sample.
    Each value returned by the Dataset's iterator is a tuple of (X, y,
    w, id) for one sample.

    Parameters
    ----------
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
      if True, the data is produced in order.  If False, a different
      random permutation of the data is used for each epoch.

    Returns
    -------
    `torch.utils.data.IterableDataset` that iterates over the data in
    this dataset.
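
    A hedged usage sketch (assumes PyTorch is installed):

    >>> torch_dataset = dataset.make_pytorch_dataset(epochs=1)
    >>> for X_s, y_s, w_s, id_s in torch_dataset:
    ...   pass  # process one sample of (X, y, w, id) at a time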
    """
    raise NotImplementedError()

  def to_dataframe(self):
    """Construct a pandas DataFrame containing the data from this Dataset."""
    """Construct a pandas DataFrame containing the data from this Dataset.

    Returns
    -------
    pandas dataframe. Will have columns "X1,X2,..." for features,
    "y1,y2,..." for labels, "w1,w2,..." for weights, and column "ids"
    for identifiers.
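
    For example (a sketch; column names follow the scheme above):

    >>> df = dataset.to_dataframe()
    >>> columns = list(df.columns)  # e.g. ['X1', 'X2', 'X3', 'y1', 'w1', 'ids']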
    """
    X = self.X
    y = self.y
    w = self.w
@@ -354,17 +520,21 @@ class Dataset(object):
    df: DataFrame
      the pandas DataFrame
    X: string or list of strings
      the name of the column or columns containing the X array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
      the name of the column or columns containing the X array.  If
      this is None, it will look for default column names that match
      those produced by to_dataframe().
    y: string or list of strings
      the name of the column or columns containing the y array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
      the name of the column or columns containing the y array.  If
      this is None, it will look for default column names that match
      those produced by to_dataframe().
    w: string or list of strings
      the name of the column or columns containing the w array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
      the name of the column or columns containing the w array.  If
      this is None, it will look for default column names that match
      those produced by to_dataframe().
    ids: string
      the name of the column containing the ids.  If this is None, it will look
      for default column names that match those produced by to_dataframe().
      the name of the column containing the ids.  If this is None, it
      will look for default column names that match those produced by
      to_dataframe().
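
    A hedged round-trip sketch (assumes the default column names
    produced by `to_dataframe()`):

    >>> df = dataset.to_dataframe()
    >>> dataset2 = Dataset.from_dataframe(df)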
    """
    # Find the X values.

@@ -426,9 +596,32 @@ class Dataset(object):


class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""
  """A Dataset defined by in-memory numpy arrays.

  This subclass of `Dataset` stores arrays `X,y,w,ids` in memory as
  numpy arrays. This makes it very easy to construct `NumpyDataset`
  objects. For example

  >>> import numpy as np
  >>> dataset = NumpyDataset(X=np.random.rand(5, 3), y=np.random.rand(5,), ids=np.arange(5))
  """

  def __init__(self, X, y=None, w=None, ids=None, n_tasks=1):
    """Initialize this object.

    Parameters
    ----------
    X: np.ndarray
      Input features. Of shape `(n_samples,...)`
    y: np.ndarray, optional
      Labels. Of shape `(n_samples, n_tasks)` typically.
    w: np.ndarray, optional
      Weights. Of same shape as `y`.
    ids: np.ndarray, optional
      Identifiers. Of shape `(n_samples,)`
    n_tasks: int, optional
      Number of learning tasks.
    """
    n_samples = len(X)
    if n_samples > 0:
      if y is None:
@@ -462,7 +655,8 @@ class NumpyDataset(Dataset):
  def get_shape(self):
    """Get the shape of the dataset.

    Returns four tuples, giving the shape of the X, y, w, and ids arrays.
    Returns four tuples, giving the shape of the X, y, w, and ids
    arrays.
    """
    return self._X.shape, self._y.shape, self._w.shape, self._ids.shape

@@ -499,7 +693,23 @@ class NumpyDataset(Dataset):
                  pad_batches=False):
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    Each minibatch is returned as a tuple of four numpy arrays: (X, y,
    w, ids).

    Parameters
    ----------
    batch_size: int, optional
      Number of elements in each batch
    epoch: int, optional
      Number of epochs to walk over dataset
    deterministic: bool, optional
      If True, follow deterministic order.
    pad_batches: bool, optional
      If True, pad each batch to `batch_size`.

    Returns
    -------
    Generator which yields tuples of four numpy arrays `(X, y, w, ids)`
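
    For example, a sketch of a simple loop over minibatches (the loop
    body is a placeholder):

    >>> for X_b, y_b, w_b, ids_b in dataset.iterbatches(batch_size=2):
    ...   pass  # process one minibatch here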
    """

    def iterate(dataset, batch_size, deterministic, pad_batches):
@@ -551,9 +761,9 @@ class NumpyDataset(Dataset):

    >> newx, newy, neww = fn(x, y, w)

    It might be called only once with the whole dataset, or multiple times with
    different subsets of the data.  Each time it is called, it should transform
    the samples and return the transformed data.
    It might be called only once with the whole dataset, or multiple
    times with different subsets of the data.  Each time it is called,
    it should transform the samples and return the transformed data.

    Parameters
    ----------
@@ -692,18 +902,17 @@ class DiskDataset(Dataset):
  A Dataset that is stored as a set of files on disk.
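
  A construction sketch using `from_numpy` (defined below):

  >>> import numpy as np
  >>> dataset = DiskDataset.from_numpy(np.ones((2, 2)), np.ones((2, 1)))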
  """

  def __init__(self, data_dir, verbose=True):
  def __init__(self, data_dir):
    """
    Turns featurized dataframes into numpy files and writes them, along with metadata, to disk.
    """
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    logger.info("Loading dataset from disk.")
    self.tasks, self.metadata_df = self.load_metadata()

  @staticmethod
  def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
  def create_dataset(shard_generator, data_dir=None, tasks=[]):
    """Creates a new DiskDataset

    Parameters
@@ -731,8 +940,8 @@ class DiskDataset(Dataset):
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
    logger.info("TIMING: dataset construction took %0.3f s" % (time2 - time1))
    return DiskDataset(data_dir)

  def load_metadata(self):
    try:
@@ -1072,7 +1281,7 @@ class DiskDataset(Dataset):

    Example:

    >>> dataset = DiskDataset.from_numpy(np.ones((2,2)), np.ones((2,1)), verbose=False)
    >>> dataset = DiskDataset.from_numpy(np.ones((2,2)), np.ones((2,1)))
    >>> for x, y, w, id in dataset.itersamples():
    ...   print(x.tolist(), y.tolist(), w.tolist(), id)
    [1.0, 1.0] [1.0] [1.0] 0
@@ -1123,11 +1332,6 @@ class DiskDataset(Dataset):
      out_dir = tempfile.mkdtemp()
    tasks = self.get_task_names()

    if 'verbose' in args:
      verbose = args['verbose']
    else:
      verbose = True

    def generator():
      for shard_num, row in self.metadata_df.iterrows():
        X, y, w, ids = self.get_shard(shard_num)
@@ -1135,7 +1339,7 @@ class DiskDataset(Dataset):
        yield (newx, newy, neww, ids)

    return DiskDataset.create_dataset(
        generator(), data_dir=out_dir, tasks=tasks, verbose=verbose)
        generator(), data_dir=out_dir, tasks=tasks)

  def make_pytorch_dataset(self, epochs=1, deterministic=False):
    """Create a torch.utils.data.IterableDataset that iterates over the data in this Dataset.
@@ -1179,13 +1383,7 @@ class DiskDataset(Dataset):
    return TorchDataset()

  @staticmethod
  def from_numpy(X,
                 y=None,
                 w=None,
                 ids=None,
                 tasks=None,
                 data_dir=None,
                 verbose=True):
  def from_numpy(X, y=None, w=None, ids=None, tasks=None, data_dir=None):
    """Creates a DiskDataset object from specified Numpy arrays."""
    n_samples = len(X)
    if ids is None:
@@ -1218,7 +1416,7 @@ class DiskDataset(Dataset):

    # raw_data = (X, y, w, ids)
    return DiskDataset.create_dataset(
        [(X, y, w, ids)], data_dir=data_dir, tasks=tasks, verbose=verbose)
        [(X, y, w, ids)], data_dir=data_dir, tasks=tasks)

  @staticmethod
  def merge(datasets, merge_dir=None):
@@ -1306,7 +1504,7 @@ class DiskDataset(Dataset):
      X_s = densify_features(X_sparse_s, num_features)
      self.set_shard(i, X_s, y_s, w_s, ids_s)
    time2 = time.time()
    log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1), self.verbose)
    logger.info("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1))

  def complete_shuffle(self, data_dir=None):
    """
@@ -1323,7 +1521,7 @@ class DiskDataset(Dataset):

    Returns
    -------
    DiskDatasset
    DiskDataset
      A DiskDataset with a single shard.

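    A hedged usage sketch (`dataset` is assumed; the shuffled copy is
    written to a temporary directory by default):

    >>> shuffled = dataset.complete_shuffle()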
    """
@@ -1425,10 +1623,11 @@ class DiskDataset(Dataset):

    Parameters
    ----------
    select_dir: string
      Path to new directory that the selected indices will be copied to.
    indices: list
      List of indices to select.
    select_dir: string
      Path to new directory that the selected indices will be copied
      to.
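
    For example, a sketch selecting the first two datapoints into a
    temporary directory:

    >>> subset = dataset.select([0, 1])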
    """
    if select_dir is not None:
      if not os.path.exists(select_dir):
@@ -1437,8 +1636,7 @@ class DiskDataset(Dataset):
      select_dir = tempfile.mkdtemp()
    # Handle edge case with empty indices
    if not len(indices):
      return DiskDataset.create_dataset(
          [], data_dir=select_dir, verbose=self.verbose)
      return DiskDataset.create_dataset([], data_dir=select_dir)
    indices = np.array(sorted(indices)).astype(int)
    tasks = self.get_task_names()

@@ -1475,7 +1673,7 @@ class DiskDataset(Dataset):
          return

    return DiskDataset.create_dataset(
        generator(), data_dir=select_dir, tasks=tasks, verbose=self.verbose)
        generator(), data_dir=select_dir, tasks=tasks)

  @property
  def ids(self):
@@ -1751,15 +1949,13 @@ class ImageDataset(Dataset):
  def select(self, indices, select_dir=None):
    """Creates a new dataset from a selection of indices from self.

    TODO(rbharath): select_dir is here due to dc.splits always passing in
    splits.

    Parameters
    ----------
    indices: list
      List of indices to select.
    select_dir: string
      Ignored.
      Used to provide the same API as `DiskDataset`. Ignored since
      `ImageDataset` is purely in-memory.
    """
    if isinstance(self._X, np.ndarray):
      X = self._X[indices]
@@ -1776,16 +1972,21 @@ class ImageDataset(Dataset):
  def make_pytorch_dataset(self, epochs=1, deterministic=False):
    """Create a torch.utils.data.IterableDataset that iterates over the data in this Dataset.

    Each value returned by the Dataset's iterator is a tuple of (X, y, w, id) for
    one sample.
    Each value returned by the Dataset's iterator is a tuple of (X, y,
    w, id) for one sample.

    Parameters
    ----------
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
      if True, the data is produced in order.  If False, a different
      random permutation of the data is used for each epoch.

    Returns
    -------
    `torch.utils.data.IterableDataset` iterating over the same data as
    this dataset.
    """
    import torch

@@ -1823,22 +2024,57 @@ class ImageDataset(Dataset):


class Databag(object):
  """
  A utility class to iterate through multiple datasets together.
  """A utility class to iterate through multiple datasets together.


  A `Databag` is useful when you have multiple datasets that you want
  to iterate in locksteps. This might be easiest to grasp with a
  simple code example.

  >>> import numpy as np
  >>> ones_dataset = NumpyDataset(X=np.ones((5, 3)))
  >>> zeros_dataset = NumpyDataset(X=np.zeros((5, 3)))
  >>> databag = Databag({"ones": ones_dataset, "zeros": zeros_dataset})
  >>> for sample_dict in databag.iterbatches(batch_size=1):
  ...   print(sample_dict)
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}

  Note how we get a batch at a time from each of the datasets in the
  `Databag`. This can be useful for training models that combine data
  from multiple `Dataset` objects at a time.
  """

  def __init__(self, datasets=None):
    """Initialize this `Databag`.

    Parameters
    ----------
    datasets: dict, optional
      A dictionary mapping keys to `Dataset` objects.
    """
    if datasets is None:
      self.datasets = dict()
    else:
      self.datasets = datasets

  def add_dataset(self, key, dataset):
    """Adds a dataset to this databag.

    Parameters
    ----------
    key: hashable value
      Key to be added
    dataset: `Dataset`
      The dataset that `key` should point to.
    """
    self.datasets[key] = dataset

  def iterbatches(self, **kwargs):
    """
    Loop through all internal datasets in the same order
    """Loop through all internal datasets in the same order.

    Parameters
    ----------
    batch_size: int
@@ -1851,7 +2087,6 @@ class Databag(object):
    Returns
    -------
    Generator which yields a dictionary {key: dataset.X[batch]}
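
    For example, continuing the sketch from the class docstring:

    >>> for batch_dict in databag.iterbatches(batch_size=1, epochs=1):
    ...   pass  # batch_dict maps each key to one batch from that dataset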

    """
    key_order = [x for x in self.datasets.keys()]
    if "epochs" in kwargs:
+36 −3
@@ -62,6 +62,39 @@ def get_data_dir():
  return tempfile.gettempdir()


# The number of elements to print for dataset ids/tasks
_print_threshold = 10


def get_print_threshold():
  """Return the printing threshold for array.

  The print threshold is the number of elements from ids/tasks to
  print when printing representations of `Dataset` objects. 

  Returns 
  ----------
  threshold: int
    Number of elements that will be printed
  """
  return _print_threshold


def set_print_threshold(threshold):
  """Set print threshold

  The print threshold is the number of elements from ids/tasks to
  print when printing representations of `Dataset` objects. 

  Parameters
  ----------
  threshold: int
    Number of elements to print.
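
  For example, to show more ids when printing `Dataset` objects (a
  sketch):

  >>> import deepchem as dc
  >>> dc.utils.set_print_threshold(20)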
  """
  global _print_threshold
  _print_threshold = threshold


def download_url(url, dest_dir=get_data_dir(), name=None):
  """Download a file to disk.

@@ -140,9 +173,9 @@ class ScaffoldGenerator(object):
    """
    Get Murcko scaffolds for molecules.

    Murcko scaffolds are described in DOI: 10.1021/jm9602928. They are
    essentially that part of the molecule consisting of rings and the
    linker atoms between them.
    Murcko scaffolds are described in DOI: 10.1021/jm9602928.
    They are essentially that part of the molecule consisting of
    rings and the linker atoms between them.

    Parameters
    ----------