Commit 67095305 authored by Bharath Ramsundar

Dataset changes

parent a8d420da
+330 −95
@@ -8,24 +8,34 @@ import deepchem as dc
import numpy as np
import pandas as pd
import random
from deepchem.utils.save import save_to_disk, save_metadata
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import log
import logging
from pandas import read_hdf
import tempfile
import time
import shutil
import json
from multiprocessing.dummy import Pool
import warnings
from multiprocessing.dummy import Pool
from deepchem.utils.save import save_to_disk, save_metadata
from deepchem.utils.save import load_from_disk

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"
logger = logging.getLogger(__name__)


def sparsify_features(X):
  """Extracts a sparse feature representation from dense feature array."""
  """Extracts a sparse feature representation from dense feature array.

  Parameters
  ----------
  X: np.ndarray
    Of shape `(n_samples, ...)`
  
  Returns
  -------
  X_sparse, a np.ndarray with `dtype=object` where `X_sparse[i]` is a
  tuple of `(nonzero_inds, nonzero_vals)` with nonzero indices and
  values in the i-th sample of `X`.
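
  For illustration, a small sketch of pulling out the sparse
  representation (array values here are hypothetical):

  >>> X = np.array([[0.0, 1.0], [2.0, 0.0]])
  >>> X_sparse = sparsify_features(X)
  >>> nonzero_inds, nonzero_vals = X_sparse[0]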
  """
  n_samples = len(X)
  X_sparse = []
  for i in range(n_samples):
@@ -37,7 +47,24 @@ def sparsify_features(X):


def densify_features(X_sparse, num_features):
  """Expands sparse feature representation to dense feature array."""
  """Expands sparse feature representation to dense feature array.

  Assumes that the sparse representation was constructed from an array
  of original shape `(n_samples, num_features)`, so it does not
  support reconstructing multidimensional dense arrays.

  Parameters
  ----------
  X_sparse: np.ndarray
    Must have `dtype=object`. `X_sparse[i]` must be a tuple of nonzero
    indices and values.
  num_features: int
    Number of features in dense array.

  Returns
  -------
  X, a np.ndarray of shape `(n_samples, num_features)`.
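
  A hedged round-trip sketch (assumes the sparse form was produced by
  `sparsify_features` above):

  >>> X = np.array([[0.0, 1.0], [2.0, 0.0]])
  >>> X_dense = densify_features(sparsify_features(X), num_features=2)
  >>> assert (X == X_dense).all()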
  """
  n_samples = len(X_sparse)
  X = np.zeros((n_samples, num_features))
  for i in range(n_samples):
@@ -49,10 +76,38 @@ def densify_features(X_sparse, num_features):
def pad_features(batch_size, X_b):
  """Pads a batch of features to have precisely batch_size elements.

  Version of pad_batch for use at prediction time.
  Given an array of features with length less than or equal to
  `batch_size`, pads it to `batch_size` length. It does this by
  repeating the original features in tiled fashion. For illustration,
  suppose that `len(X_b) == 3` and `batch_size == 10`.

  >>> X_b = np.arange(3)
  >>> X_b
  array([0, 1, 2])
  >>> batch_size = 10
  >>> X_manual = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0])
  >>> X_out = pad_features(batch_size, X_b)
  >>> assert (X_manual == X_out).all() 

  This function is similar to `pad_batch` but doesn't handle labels
  `y` or weights `w` and is intended to be used for inference-time
  query processing.

  Parameters
  ----------
  batch_size: int
    The number of datapoints in a batch
  X_b: np.ndarray
    Must be such that `len(X_b) <= batch_size`

  Returns
  -------
  X_out, a np.ndarray with `len(X_out) == batch_size`.
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
  if num_samples > batch_size:
    raise ValueError("Cannot pad an array longer than `batch_size`")
  elif num_samples == batch_size:
    return X_b
  else:
    # By invariant of when this is called, can assume num_samples > 0
@@ -79,7 +134,28 @@ def pad_features(batch_size, X_b):
def pad_batch(batch_size, X_b, y_b, w_b, ids_b):
  """Pads batch to have size precisely batch_size elements.

  Fills in batch by wrapping around samples till whole batch is filled.
  Given arrays of features `X_b`, labels `y_b`, weights `w_b`, and
  identifiers `ids_b`, all with length less than or equal to
  `batch_size`, pads them to `batch_size` length. It does this by
  repeating the original entries in tiled fashion. Note that `X_b,
  y_b, w_b, ids_b` must all have the same length.

  Parameters
  ----------
  batch_size: int
    The number of datapoints in a batch
  X_b: np.ndarray
    Must be such that `len(X_b) <= batch_size`
  y_b: np.ndarray
    Must be such that `len(y_b) <= batch_size`
  w_b: np.ndarray
    Must be such that `len(w_b) <= batch_size`
  ids_b: np.ndarray
    Must be such that `len(ids_b) <= batch_size`

  Returns
  -------
  (X_out, y_out, w_out, ids_out), all np.ndarray with length `batch_size`.
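
  For illustration, a minimal sketch with toy one-dimensional arrays:

  >>> X_b, y_b = np.arange(3), np.arange(3)
  >>> w_b, ids_b = np.ones(3), np.arange(3)
  >>> X_out, y_out, w_out, ids_out = pad_batch(5, X_b, y_b, w_b, ids_b)
  >>> assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == 5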
  """
  num_samples = len(X_b)
  if num_samples == batch_size:
@@ -132,7 +208,22 @@ def pad_batch(batch_size, X_b, y_b, w_b, ids_b):


class Dataset(object):
  """Abstract base class for datasets defined by X, y, w elements."""
  """Abstract base class for datasets defined by X, y, w elements.

  `Dataset` objects are used to store representations of a dataset as
  used in a machine learning task. Datasets contain features `X`,
  labels `y`, weights `w` and identifiers `ids`. Different subclasses
  of `Dataset` may choose to hold `X, y, w, ids` in memory or on disk.

  The `Dataset` class attempts to provide for strong interoperability
  with other machine learning representations for datasets.
  Interconversion methods allow `Dataset` objects to be converted
  to and from pandas dataframes and tensorflow datasets, and to (but
  not yet from) pytorch datasets.

  Note that you can never instantiate a `Dataset` object directly.
  Instead you will need to instantiate one of the concrete subclasses.
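
  For example, a sketch of the dataframe interconversion using the
  `NumpyDataset` subclass defined later in this module:

  >>> import numpy as np
  >>> dataset = NumpyDataset(X=np.random.rand(5, 3), y=np.random.rand(5,))
  >>> df = dataset.to_dataframe()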
  """

  def __init__(self):
    raise NotImplementedError()
@@ -146,7 +237,8 @@ class Dataset(object):
  def get_shape(self):
    """Get the shape of the dataset.

    Returns four tuples, giving the shape of the X, y, w, and ids arrays.
    Returns four tuples, giving the shape of the X, y, w, and ids
    arrays.
    """
    raise NotImplementedError()

@@ -156,43 +248,83 @@ class Dataset(object):

  @property
  def X(self):
    """Get the X vector for this dataset as a single numpy array."""
    """Get the X vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of features `X`.
    """
    raise NotImplementedError()

  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array."""
    """Get the y vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of labels `y`.
    """
    raise NotImplementedError()

  @property
  def ids(self):
    """Get the ids vector for this dataset as a single numpy array."""
    """Get the ids vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of identifiers `ids`.
    """

    raise NotImplementedError()

  @property
  def w(self):
    """Get the weight vector for this dataset as a single numpy array."""
    """Get the weight vector for this dataset as a single numpy array.

    Returns
    -------
    Numpy array of weights `w`.
    """
    raise NotImplementedError()

  def __repr__(self):
    """Convert self to REPL print representation."""
    threshold = dc.utils.get_print_threshold()
    id_str = np.array2string(self.ids, threshold=threshold)
    task_str = np.array2string(
        np.array(self.get_task_names()), threshold=threshold)
    return "<%s X.shape: %s, y.shape: %s, w.shape: %s, ids: %s, task_names: %s>" % (
        self.__class__.__name__, str(self.X.shape), str(self.y.shape),
        str(self.w.shape), id_str, task_str)

  def __str__(self):
    """Convert self to str representation."""
    return self.__repr__()

  def iterbatches(self,
                  batch_size=None,
                  epoch=0,
                  deterministic=False,
                  pad_batches=False):
    """
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: `(X,
    y, w, ids)`.

    Parameters
    ----------

    batch_size: int, optional
      Number of elements in each batch
    epoch: int, optional
      Number of epochs to walk over dataset
    deterministic: bool, optional
      If True, follow deterministic order.
    pad_batches: bool, optional
      If True, pad each batch to `batch_size`.

    Returns
    -------

    """
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    Generator which yields tuples of four numpy arrays `(X, y, w, ids)`
    """
    raise NotImplementedError()

@@ -216,9 +348,9 @@ class Dataset(object):

    >> newx, newy, neww = fn(x, y, w)

    It might be called only once with the whole dataset, or multiple times with
    different subsets of the data.  Each time it is called, it should transform
    the samples and return the transformed data.
    It might be called only once with the whole dataset, or multiple
    times with different subsets of the data.  Each time it is called,
    it should transform the samples and return the transformed data.

    Parameters
    ----------
@@ -232,7 +364,25 @@ class Dataset(object):
    raise NotImplementedError()

  def get_statistics(self, X_stats=True, y_stats=True):
    """Compute and return statistics of this dataset."""
    """Compute and return statistics of this dataset.

    Uses `self.itersamples()` to compute means and standard deviations
    of the dataset. Can compute on large datasets that don't fit in
    memory.

    Parameters
    ----------
    X_stats: bool, optional
      If True, compute feature-level mean and standard deviations.
    y_stats: bool, optional
      If True, compute label-level mean and standard deviations.

    Returns
    -------
    If `X_stats == True`, returns `(X_means, X_stds)`. If `y_stats ==
    True`, returns `(y_means, y_stds)`. If both are true, returns
    `(X_means, X_stds, y_means, y_stds)`.
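
    For example, a sketch of computing feature statistics only (the
    `dataset` name here stands in for any concrete `Dataset`):

    >>> X_means, X_stds = dataset.get_statistics(X_stats=True, y_stats=False)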
    """
    X_means = 0.0
    X_m2 = 0.0
    y_means = 0.0
@@ -270,8 +420,8 @@ class Dataset(object):
                      pad_batches=False):
    """Create a tf.data.Dataset that iterates over the data in this Dataset.

    Each value returned by the Dataset's iterator is a tuple of (X, y, w) for
    one batch.
    Each value returned by the Dataset's iterator is a tuple of (X, y,
    w) for one batch.

    Parameters
    ----------
@@ -280,11 +430,15 @@ class Dataset(object):
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
      if True, the data is produced in order.  If False, a different
      random permutation of the data is used for each epoch.
    pad_batches: bool
      if True, batches are padded as necessary to make the size of each batch
      exactly equal batch_size.
      if True, batches are padded as necessary to make the size of
      each batch exactly equal batch_size.

    Returns
    -------
    A `tf.data.Dataset` that iterates over the same data.
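
    A hedged usage sketch (assumes TensorFlow is installed and
    `dataset` is a concrete `Dataset`):

    >>> tf_dataset = dataset.make_tf_dataset(batch_size=32, epochs=1)
    >>> for X_b, y_b, w_b in tf_dataset:
    ...   pass  # consume one batch of (X, y, w) at a time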
    """
    # Retrieve the first sample so we can determine the dtypes.

@@ -308,21 +462,33 @@ class Dataset(object):
  def make_pytorch_dataset(self, epochs=1, deterministic=False):
    """Create a torch.utils.data.IterableDataset that iterates over the data in this Dataset.

    Each value returned by the Dataset's iterator is a tuple of (X, y, w, id) for
    one sample.
    Each value returned by the Dataset's iterator is a tuple of (X, y,
    w, id) for one sample.

    Parameters
    ----------
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
      if True, the data is produced in order.  If False, a different
      random permutation of the data is used for each epoch.

    Returns
    -------
    `torch.utils.data.IterableDataset` that iterates over the data in
    this dataset.
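
    A hedged usage sketch (assumes PyTorch is installed):

    >>> torch_dataset = dataset.make_pytorch_dataset(epochs=1)
    >>> for X_s, y_s, w_s, id_s in torch_dataset:
    ...   pass  # process one sample of (X, y, w, id) at a time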
    """
    raise NotImplementedError()

  def to_dataframe(self):
    """Construct a pandas DataFrame containing the data from this Dataset."""
    """Construct a pandas DataFrame containing the data from this Dataset.

    Returns
    -------
    pandas dataframe. Will have columns "X1,X2,..." for features,
    "y1,y2,..." for labels, "w1,w2,..." for weights, and column "ids"
    for identifiers.
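
    For example (a sketch; column names follow the scheme above):

    >>> df = dataset.to_dataframe()
    >>> columns = list(df.columns)  # e.g. ['X1', 'X2', 'X3', 'y1', 'w1', 'ids']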
    """
    X = self.X
    y = self.y
    w = self.w
@@ -354,17 +520,21 @@ class Dataset(object):
    df: DataFrame
      the pandas DataFrame
    X: string or list of strings
      the name of the column or columns containing the X array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
      the name of the column or columns containing the X array.  If
      this is None, it will look for default column names that match
      those produced by to_dataframe().
    y: string or list of strings
      the name of the column or columns containing the y array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
      the name of the column or columns containing the y array.  If
      this is None, it will look for default column names that match
      those produced by to_dataframe().
    w: string or list of strings
      the name of the column or columns containing the w array.  If this is None,
      it will look for default column names that match those produced by to_dataframe().
      the name of the column or columns containing the w array.  If
      this is None, it will look for default column names that match
      those produced by to_dataframe().
    ids: string
      the name of the column containing the ids.  If this is None, it will look
      for default column names that match those produced by to_dataframe().
      the name of the column containing the ids.  If this is None, it
      will look for default column names that match those produced by
      to_dataframe().
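
    A hedged round-trip sketch (assumes the default column names
    produced by `to_dataframe()`):

    >>> df = dataset.to_dataframe()
    >>> dataset2 = Dataset.from_dataframe(df)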
    """
    # Find the X values.

@@ -426,9 +596,32 @@ class Dataset(object):


class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""
  """A Dataset defined by in-memory numpy arrays.

  This subclass of `Dataset` stores arrays `X,y,w,ids` in memory as
  numpy arrays. This makes it very easy to construct `NumpyDataset`
  objects. For example

  >>> import numpy as np
  >>> dataset = NumpyDataset(X=np.random.rand(5, 3), y=np.random.rand(5,), ids=np.arange(5))
  """

  def __init__(self, X, y=None, w=None, ids=None, n_tasks=1):
    """Initialize this object.

    Parameters
    ----------
    X: np.ndarray
      Input features. Of shape `(n_samples,...)`
    y: np.ndarray, optional
      Labels. Of shape `(n_samples, n_tasks)` typically.
    w: np.ndarray, optional
      Weights. Of same shape as `y`.
    ids: np.ndarray, optional
      Identifiers. Of shape `(n_samples,)`
    n_tasks: int, optional
      Number of learning tasks.
    """
    n_samples = len(X)
    if n_samples > 0:
      if y is None:
@@ -462,7 +655,8 @@ class NumpyDataset(Dataset):
  def get_shape(self):
    """Get the shape of the dataset.

    Returns four tuples, giving the shape of the X, y, w, and ids arrays.
    Returns four tuples, giving the shape of the X, y, w, and ids
    arrays.
    """
    return self._X.shape, self._y.shape, self._w.shape, self._ids.shape

@@ -499,7 +693,23 @@ class NumpyDataset(Dataset):
                  pad_batches=False):
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    Each minibatch is returned as a tuple of four numpy arrays: (X, y,
    w, ids).

    Parameters
    ----------
    batch_size: int, optional
      Number of elements in each batch
    epoch: int, optional
      Number of epochs to walk over dataset
    deterministic: bool, optional
      If True, follow deterministic order.
    pad_batches: bool, optional
      If True, pad each batch to `batch_size`.

    Returns
    -------
    Generator which yields tuples of four numpy arrays `(X, y, w, ids)`
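
    For example, a sketch of a simple loop over minibatches (the loop
    body is a placeholder):

    >>> for X_b, y_b, w_b, ids_b in dataset.iterbatches(batch_size=2):
    ...   pass  # process one minibatch here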
    """

    def iterate(dataset, batch_size, deterministic, pad_batches):
@@ -551,9 +761,9 @@ class NumpyDataset(Dataset):

    >> newx, newy, neww = fn(x, y, w)

    It might be called only once with the whole dataset, or multiple times with
    different subsets of the data.  Each time it is called, it should transform
    the samples and return the transformed data.
    It might be called only once with the whole dataset, or multiple
    times with different subsets of the data.  Each time it is called,
    it should transform the samples and return the transformed data.

    Parameters
    ----------
@@ -692,18 +902,17 @@ class DiskDataset(Dataset):
  A Dataset that is stored as a set of files on disk.
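
  A construction sketch using `from_numpy` (defined below):

  >>> import numpy as np
  >>> dataset = DiskDataset.from_numpy(np.ones((2, 2)), np.ones((2, 1)))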
  """

  def __init__(self, data_dir, verbose=True):
  def __init__(self, data_dir):
    """
    Turns featurized dataframes into numpy files and writes them, along with metadata, to disk.
    """
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    logger.info("Loading dataset from disk.")
    self.tasks, self.metadata_df = self.load_metadata()

  @staticmethod
  def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
  def create_dataset(shard_generator, data_dir=None, tasks=[]):
    """Creates a new DiskDataset

    Parameters
@@ -731,8 +940,8 @@ class DiskDataset(Dataset):
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
    logger.info("TIMING: dataset construction took %0.3f s" % (time2 - time1))
    return DiskDataset(data_dir)

  def load_metadata(self):
    try:
@@ -1072,7 +1281,7 @@ class DiskDataset(Dataset):

    Example:

    >>> dataset = DiskDataset.from_numpy(np.ones((2,2)), np.ones((2,1)), verbose=False)
    >>> dataset = DiskDataset.from_numpy(np.ones((2,2)), np.ones((2,1)))
    >>> for x, y, w, id in dataset.itersamples():
    ...   print(x.tolist(), y.tolist(), w.tolist(), id)
    [1.0, 1.0] [1.0] [1.0] 0
@@ -1123,11 +1332,6 @@ class DiskDataset(Dataset):
      out_dir = tempfile.mkdtemp()
    tasks = self.get_task_names()

    if 'verbose' in args:
      verbose = args['verbose']
    else:
      verbose = True

    def generator():
      for shard_num, row in self.metadata_df.iterrows():
        X, y, w, ids = self.get_shard(shard_num)
@@ -1135,7 +1339,7 @@ class DiskDataset(Dataset):
        yield (newx, newy, neww, ids)

    return DiskDataset.create_dataset(
        generator(), data_dir=out_dir, tasks=tasks, verbose=verbose)
        generator(), data_dir=out_dir, tasks=tasks)

  def make_pytorch_dataset(self, epochs=1, deterministic=False):
    """Create a torch.utils.data.IterableDataset that iterates over the data in this Dataset.
@@ -1179,13 +1383,7 @@ class DiskDataset(Dataset):
    return TorchDataset()

  @staticmethod
  def from_numpy(X,
                 y=None,
                 w=None,
                 ids=None,
                 tasks=None,
                 data_dir=None,
                 verbose=True):
  def from_numpy(X, y=None, w=None, ids=None, tasks=None, data_dir=None):
    """Creates a DiskDataset object from specified Numpy arrays."""
    n_samples = len(X)
    if ids is None:
@@ -1218,7 +1416,7 @@ class DiskDataset(Dataset):

    # raw_data = (X, y, w, ids)
    return DiskDataset.create_dataset(
        [(X, y, w, ids)], data_dir=data_dir, tasks=tasks, verbose=verbose)
        [(X, y, w, ids)], data_dir=data_dir, tasks=tasks)

  @staticmethod
  def merge(datasets, merge_dir=None):
@@ -1306,7 +1504,7 @@ class DiskDataset(Dataset):
      X_s = densify_features(X_sparse_s, num_features)
      self.set_shard(i, X_s, y_s, w_s, ids_s)
    time2 = time.time()
    log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1), self.verbose)
    logger.info("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1))

  def complete_shuffle(self, data_dir=None):
    """
@@ -1323,7 +1521,7 @@ class DiskDataset(Dataset):

    Returns
    -------
    DiskDatasset
    DiskDataset
      A DiskDataset with a single shard.

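    A hedged usage sketch (`dataset` is assumed; the shuffled copy is
    written to a temporary directory by default):

    >>> shuffled = dataset.complete_shuffle()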
    """
@@ -1425,10 +1623,11 @@ class DiskDataset(Dataset):

    Parameters
    ----------
    select_dir: string
      Path to new directory that the selected indices will be copied to.
    indices: list
      List of indices to select.
    select_dir: string
      Path to new directory that the selected indices will be copied
      to.
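
    For example, a sketch selecting the first two datapoints into a
    temporary directory:

    >>> subset = dataset.select([0, 1])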
    """
    if select_dir is not None:
      if not os.path.exists(select_dir):
@@ -1437,8 +1636,7 @@ class DiskDataset(Dataset):
      select_dir = tempfile.mkdtemp()
    # Handle edge case with empty indices
    if not len(indices):
      return DiskDataset.create_dataset(
          [], data_dir=select_dir, verbose=self.verbose)
      return DiskDataset.create_dataset([], data_dir=select_dir)
    indices = np.array(sorted(indices)).astype(int)
    tasks = self.get_task_names()

@@ -1475,7 +1673,7 @@ class DiskDataset(Dataset):
          return

    return DiskDataset.create_dataset(
        generator(), data_dir=select_dir, tasks=tasks, verbose=self.verbose)
        generator(), data_dir=select_dir, tasks=tasks)

  @property
  def ids(self):
@@ -1751,15 +1949,13 @@ class ImageDataset(Dataset):
  def select(self, indices, select_dir=None):
    """Creates a new dataset from a selection of indices from self.

    TODO(rbharath): select_dir is here due to dc.splits always passing in
    splits.

    Parameters
    ----------
    indices: list
      List of indices to select.
    select_dir: string
      Ignored.
      Used to provide the same API as `DiskDataset`. Ignored since
      `ImageDataset` is purely in-memory.
    """
    if isinstance(self._X, np.ndarray):
      X = self._X[indices]
@@ -1776,16 +1972,21 @@ class ImageDataset(Dataset):
  def make_pytorch_dataset(self, epochs=1, deterministic=False):
    """Create a torch.utils.data.IterableDataset that iterates over the data in this Dataset.

    Each value returned by the Dataset's iterator is a tuple of (X, y, w, id) for
    one sample.
    Each value returned by the Dataset's iterator is a tuple of (X, y,
    w, id) for one sample.

    Parameters
    ----------
    epochs: int
      the number of times to iterate over the Dataset
    deterministic: bool
      if True, the data is produced in order.  If False, a different random
      permutation of the data is used for each epoch.
      if True, the data is produced in order.  If False, a different
      random permutation of the data is used for each epoch.

    Returns
    -------
    `torch.utils.data.IterableDataset` iterating over the same data as
    this dataset.
    """
    import torch

@@ -1823,22 +2024,57 @@ class ImageDataset(Dataset):


class Databag(object):
  """
  A utility class to iterate through multiple datasets together.
  """A utility class to iterate through multiple datasets together.


  A `Databag` is useful when you have multiple datasets that you want
  to iterate in locksteps. This might be easiest to grasp with a
  simple code example.

  >>> import numpy as np
  >>> ones_dataset = NumpyDataset(X=np.ones((5, 3)))
  >>> zeros_dataset = NumpyDataset(X=np.zeros((5, 3)))
  >>> databag = Databag({"ones": ones_dataset, "zeros": zeros_dataset})
  >>> for sample_dict in databag.iterbatches(batch_size=1):
  ...   print(sample_dict)
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}
  {'ones': array([[1., 1., 1.]]), 'zeros': array([[0., 0., 0.]])}

  Note how we get a batch at a time from each of the datasets in the
  `Databag`. This can be useful for training models that combine data
  from multiple `Dataset` objects at a time.
  """

  def __init__(self, datasets=None):
    """Initialize this `Databag`.

    Parameters
    ----------
    datasets: dict, optional
      A dictionary mapping keys to `Dataset` objects.
    """
    if datasets is None:
      self.datasets = dict()
    else:
      self.datasets = datasets

  def add_dataset(self, key, dataset):
    """Adds a dataset to this databag.

    Parameters
    ----------
    key: hashable value
      Key to be added
    dataset: `Dataset`
      The dataset that `key` should point to.
    """
    self.datasets[key] = dataset

  def iterbatches(self, **kwargs):
    """
    Loop through all internal datasets in the same order
    """Loop through all internal datasets in the same order.

    Parameters
    ----------
    batch_size: int
@@ -1851,7 +2087,6 @@ class Databag(object):
    Returns
    -------
    Generator which yields a dictionary {key: dataset.X[batch]}
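
    For example, continuing the sketch from the class docstring:

    >>> for batch_dict in databag.iterbatches(batch_size=1, epochs=1):
    ...   pass  # batch_dict maps each key to one batch from that dataset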

    """
    key_order = [x for x in self.datasets.keys()]
    if "epochs" in kwargs:
+36 −3
@@ -62,6 +62,39 @@ def get_data_dir():
  return tempfile.gettempdir()


# The number of elements to print for dataset ids/tasks
_print_threshold = 10


def get_print_threshold():
  """Return the printing threshold for array.

  The print threshold is the number of elements from ids/tasks to
  print when printing representations of `Dataset` objects. 

  Returns 
  ----------
  threshold: int
    Number of elements that will be printed
  """
  return _print_threshold


def set_print_threshold(threshold):
  """Set print threshold

  The print threshold is the number of elements from ids/tasks to
  print when printing representations of `Dataset` objects. 

  Parameters
  ----------
  threshold: int
    Number of elements to print.
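
  For example, to show more ids when printing `Dataset` objects (a
  sketch):

  >>> import deepchem as dc
  >>> dc.utils.set_print_threshold(20)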
  """
  global _print_threshold
  _print_threshold = threshold


def download_url(url, dest_dir=get_data_dir(), name=None):
  """Download a file to disk.

@@ -140,9 +173,9 @@ class ScaffoldGenerator(object):
    """
    Get Murcko scaffolds for molecules.

    Murcko scaffolds are described in DOI: 10.1021/jm9602928. They are
    essentially that part of the molecule consisting of rings and the
    linker atoms between them.
    Murcko scaffolds are described in DOI: 10.1021/jm9602928.
    They are essentially that part of the molecule consisting of
    rings and the linker atoms between them.

    Parameters
    ----------