Changes (354d66a2) · Commits · 钟慕尧 / deepchem

deepchem/splits/splitters.py

+328 −95

Original line number	Diff line number	Diff line
		@@ -24,7 +24,12 @@ logger = logging.getLogger(__name__)


		def generate_scaffold(smiles, include_chirality=False):
		"""Compute the Bemis-Murcko scaffold for a SMILES string."""
		"""Compute the Bemis-Murcko scaffold for a SMILES string.

		Note
		----
		This function requires `rdkit` to be installed.
		"""
		from rdkit import Chem
		mol = Chem.MolFromSmiles(smiles)
		engine = ScaffoldGenerator(include_chirality=include_chirality)
		@@ -43,36 +48,34 @@ def randomize_arrays(array_list):


		class Splitter(object):
		"""
		Abstract base class for chemically aware splits..
		"""Splitters split up Datasets into pieces for training/validation/testing.

		In machine learning applications, it's often necessary to split up a dataset
		into training/validation/test sets. Or to k-fold split a dataset (that is,
		divide into k equal subsets) for cross-validation. The `Splitter` class is
		an abstract superclass for all splitters that captures the common API across
		splitter classes.

		Note that `Splitter` is an abstract superclass. You won't want to
		instantiate this class directly. Rather you will want to use a concrete
		subclass for your application.
		"""

		def k_fold_split(self, dataset, k, directories=None, **kwargs):
		"""
		Parameters
		----------
		dataset: Dataset
		dataset: `dc.data.Dataset`
		Dataset to do a k-fold split

		k: int
		number of folds

		directories: list of str
		Number of folds to split `dataset` into.
		directories: list[str]
		list of length 2*k filepaths to save the result disk-datasets

		kwargs

		Returns
		-------
		list of length k tuples of (train, cv)

		"""
		"""
		:param dataset:
		:param k:
		:param directories:
		:param kwargs:
		:return: list of length k tuples of (train, cv)
		list of length k tuples of (train, cv) where `train` and `cv` are both
		lists of `Dataset`s.
		"""
		logger.info("Computing K-fold split")
		if directories is None:
		@@ -127,7 +130,43 @@ class Splitter(object):
		**kwargs):
		""" Splits self into train/validation/test sets.

		Returns Dataset objects.
		Returns Dataset objects for train, valid, test.

		Parameters
		----------
		dataset: data like object.
		Dataset to be split. This should either be of type
		`dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
		convert into a `Dataset`.
		train_dir: str, optional
		If specified, the directory in which the generated
		training dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		valid_dir: str, optional
		If specified, the directory in which the generated
		valid dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		test_dir: str, optional
		If specified, the directory in which the generated
		test dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.
		frac_valid: float, optional (default 0.1)
		The fraction of data to be used for the validation split.
		frac_test: float, optional (default 0.1)
		The fraction of data to be used for the test split.
		seed: int, optional (default None)
		Random seed to use.
		log_every_n: int, optional
		Controls the logger by dictating how often logger outputs
		will be produced.

		Returns
		-------
		Train, validation, and test datasets as dc.data.Dataset objects.
		"""
		logger.info("Computing train/valid/test indices")
		train_inds, valid_inds, test_inds = self.split(
		@@ -163,7 +202,33 @@ class Splitter(object):
		frac_train=.8,
		**kwargs):
		"""Splits self into train/test sets.
		Returns Dataset objects.

		Returns Dataset objects for train/test.

		Parameters
		----------
		dataset: data like object
		Dataset to be split. This should either be of type
		`dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
		convert into a `Dataset`.
		train_dir: str, optional
		If specified, the directory in which the generated
		training dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		test_dir: str, optional
		If specified, the directory in which the generated
		test dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		seed: int, optional (default None)
		Random seed to use.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.

		Returns
		-------
		Train and test datasets as dc.data.Dataset objects.
		"""
		valid_dir = tempfile.mkdtemp()
		train_dataset, _, test_dataset = self.train_valid_test_split(
		@@ -186,24 +251,48 @@ class Splitter(object):
		frac_test=None,
		log_every_n=None,
		**kwargs):
		"""
		Stub to be filled in by child classes.
		"""Return indices for specified split

		Parameters
		----------
		dataset: dc.data.Dataset
		Dataset to be split
		seed: int, optional (default None)
		Random seed to use.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.
		frac_valid: float, optional (default 0.1)
		The fraction of data to be used for the validation split.
		frac_test: float, optional (default 0.1)
		The fraction of data to be used for the test split.
		log_every_n: int, optional
		Controls the logger by dictating how often logger outputs
		will be produced.

		Returns
		-------
		A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
		the various splits.
		"""
		raise NotImplementedError


		class RandomGroupSplitter(Splitter):
		"""Random split based on groupings.

		def __init__(self, groups, args, *kwargs):
		A splitter class that splits on groupings. An example use case is when
		there are multiple conformations of the same molecule that share the same
		topology. This splitter subsequently guarantees that resulting splits
		preserve groupings.

		Note that it doesn't do any dynamic programming or something fancy to try
		to maximize the choice such that frac_train, frac_valid, or frac_test is
		maximized. It simply permutes the groups themselves. As such, use with
		caution if the number of elements per group varies significantly.
		"""
		A splitter class that splits on groupings. An example use case is when there
		are multiple conformations of the same molecule that share the same topology.
		This splitter subsequently guarantees that resulting splits preserve groupings.

		Note that it doesn't do any dynamic programming or something fancy to try to
		maximize the choice such that frac_train, frac_valid, or frac_test is maximized.
		It simply permutes the groups themselves. As such, use with caution if the number
		of elements per group varies significantly.
		def __init__(self, groups, args, *kwargs):
		"""Initialize this object.

		Parameters
		----------
		@@ -229,6 +318,29 @@ class RandomGroupSplitter(Splitter):
		frac_valid=.1,
		frac_test=.1,
		log_every_n=None):
		"""Return indices for specified split

		Parameters
		----------
		dataset: dc.data.Dataset
		Dataset to be split
		seed: int, optional (default None)
		Random seed to use.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.
		frac_valid: float, optional (default 0.1)
		The fraction of data to be used for the validation split.
		frac_test: float, optional (default 0.1)
		The fraction of data to be used for the test split.
		log_every_n: int, optional
		Controls the logger by dictating how often logger outputs
		will be produced.

		Returns
		-------
		A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
		the various splits.
		"""

		assert len(self.groups) == dataset.X.shape[0]
		np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
		@@ -267,8 +379,7 @@ class RandomGroupSplitter(Splitter):


		class RandomStratifiedSplitter(Splitter):
		"""
		RandomStratified Splitter class.
		"""RandomStratified Splitter class.

		For sparse multitask datasets, a standard split offers no guarantees
		that the splits will have any active compounds. This class guarantees
		@@ -368,7 +479,47 @@ class RandomStratifiedSplitter(Splitter):
		frac_test=.1,
		seed=None,
		log_every_n=1000):
		"""Custom split due to raggedness in original split.
		""" Splits self into train/validation/test sets.

		Most splitters use the superclass implementation
		`Splitter.train_valid_test_split` but this class has to override the
		implementation to deal with potentially ragged splits.

		Parameters
		----------
		dataset: data like object.
		Dataset to be split. This should either be of type
		`dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
		convert into a `Dataset`.
		train_dir: str, optional
		If specified, the directory in which the generated
		training dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		valid_dir: str, optional
		If specified, the directory in which the generated
		valid dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		test_dir: str, optional
		If specified, the directory in which the generated
		test dataset should be stored. This is only
		considered if `isinstance(dataset, dc.data.DiskDataset)`
		is True.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.
		frac_valid: float, optional (default 0.1)
		The fraction of data to be used for the validation split.
		frac_test: float, optional (default 0.1)
		The fraction of data to be used for the test split.
		seed: int, optional (default None)
		Random seed to use.
		log_every_n: int, optional
		Controls the logger by dictating how often logger outputs
		will be produced.

		Returns
		-------
		Train, validation, and test datasets as dc.data.Dataset objects.
		"""
		if train_dir is None:
		train_dir = tempfile.mkdtemp()
		@@ -414,10 +565,10 @@ class RandomStratifiedSplitter(Splitter):


		class SingletaskStratifiedSplitter(Splitter):
		"""
		Class for doing data splits by stratification on a single task.
		"""Class for doing data splits by stratification on a single task.

		Example:
		Example
		-------

		>>> n_samples = 100
		>>> n_features = 10
		@@ -556,6 +707,10 @@ class SingletaskStratifiedSplitter(Splitter):
		class MolecularWeightSplitter(Splitter):
		"""
		Class for doing data splits by molecular weight.

		Note
		----
		This class requires `rdkit` to be installed.
		"""

		def split(self,
		@@ -565,9 +720,31 @@ class MolecularWeightSplitter(Splitter):
		frac_valid=.1,
		frac_test=.1,
		log_every_n=None):
		"""
		Splits internal compounds into train/validation/test using the MW calculated
		by SMILES string.
		"""Splits on molecular weight.

		Splits internal compounds into train/validation/test using the MW
		calculated by SMILES string.

		Parameters
		----------
		dataset: dc.data.Dataset
		Dataset to be split
		seed: int, optional (default None)
		Random seed to use.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.
		frac_valid: float, optional (default 0.1)
		The fraction of data to be used for the validation split.
		frac_test: float, optional (default 0.1)
		The fraction of data to be used for the test split.
		log_every_n: int, optional
		Controls the logger by dictating how often logger outputs
		will be produced.

		Returns
		-------
		A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
		the various splits.
		"""

		np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
		@@ -593,11 +770,16 @@ class MolecularWeightSplitter(Splitter):


		class MaxMinSplitter(Splitter):
		"""
		"""Chemical diversity splitter.

		Class for doing splits based on the MaxMin diversity algorithm. Intuitively,
		the test set is comprised of the most diverse compounds of the entire dataset.
		Furthermore, the validation set is comprised of diverse compounds under
		the test set.

		Note
		----
		This class requires `rdkit` to be installed.
		"""

		def split(self,
		@@ -667,8 +849,7 @@ class MaxMinSplitter(Splitter):


		class RandomSplitter(Splitter):
		"""
		Class for doing random data splits.
		"""Class for doing random data splits.
		"""

		def split(self,
		@@ -680,6 +861,27 @@ class RandomSplitter(Splitter):
		log_every_n=None):
		"""
		Splits internal compounds randomly into train/validation/test.

		Parameters
		----------
		dataset: dc.data.Dataset
		Dataset to be split
		seed: int, optional (default None)
		Random seed to use.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.
		frac_valid: float, optional (default 0.1)
		The fraction of data to be used for the validation split.
		frac_test: float, optional (default 0.1)
		The fraction of data to be used for the test split.
		log_every_n: int, optional
		Controls the logger by dictating how often logger outputs
		will be produced.

		Returns
		-------
		A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
		the various splits.
		"""
		np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
		if not seed is None:
		@@ -693,8 +895,13 @@ class RandomSplitter(Splitter):


		class IndexSplitter(Splitter):
		"""
		Class for simple order based splits.
		"""Class for simple order based splits.

		Use this class when the `Dataset` you have is already ordered as you would
		like it to be processed. Then the first `frac_train` proportion is used for
		training, the next `frac_valid` for validation, and the final `frac_test` for
		testing. This class may make sense to use if your `Dataset` is already time
		ordered (for example).
		"""

		def split(self,
		@@ -704,8 +911,28 @@ class IndexSplitter(Splitter):
		frac_valid=.1,
		frac_test=.1,
		log_every_n=None):
		"""
		Splits internal compounds into train/validation/test in provided order.
		"""Splits internal compounds into train/validation/test in provided order.

		Parameters
		----------
		dataset: dc.data.Dataset
		Dataset to be split
		seed: int, optional (default None)
		Random seed to use.
		frac_train: float, optional (default 0.8)
		The fraction of data to be used for the training split.
		frac_valid: float, optional (default 0.1)
		The fraction of data to be used for the validation split.
		frac_test: float, optional (default 0.1)
		The fraction of data to be used for the test split.
		log_every_n: int, optional
		Controls the logger by dictating how often logger outputs
		will be produced.

		Returns
		-------
		A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
		the various splits.
		"""
		np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
		num_datapoints = len(dataset)
		@@ -717,8 +944,14 @@ class IndexSplitter(Splitter):


		class IndiceSplitter(Splitter):
		"""
		Class for splits based on input order.
		"""Split data in the fashion specified by user.

		For some applications, you will already know how you'd like to split the
		dataset. In this splitter, you simply specify `valid_indices` and
		`test_indices` and the datapoints at those indices are pulled out of the
		dataset. Note that this is different from `IndexSplitter` which only splits
		based on the existing dataset ordering, while this `IndiceSplitter` can
		split on any specified ordering.
		"""

		def __init__(self, valid_indices=None, test_indices=None):

docs/index.rst

+1 −0

Original line number	Diff line number	Diff line
		@@ -124,6 +124,7 @@ discussions about research, development or any general questions. If you'd like
		:name: mastertoc

		Introduction <index>
		Tutorial <tutorial>
		Installation <installation>
		Datasets <datasets>
		Data Loaders <dataloaders>

docs/tutorial.rst

0 → 100644

+84 −0

Original line number	Diff line number	Diff line
		DeepChem Tutorial
		=================

		If you're new to DeepChem, you probably want to know the basics. What is DeepChem? Why should you care about using it? The short answer is that DeepChem is a scientific machine learning library. (The "Chem" indicates the historical fact that DeepChem initially focused on chemical applications, but we aim to support all types of scientific applications more broadly).

		Why would you want to use DeepChem instead of another machine learning
		library? Simply put, DeepChem maintains an extensive collection of utilities
		to enable scientific deep learning including classes for loading scientific
		datasets, processing them, transforming them, splitting them up, and learning
		from them. Behind the scenes DeepChem uses a variety of other machine
		learning frameworks such as `sklearn`_, `tensorflow`_, and `xgboost`_. We are
		also experimenting with adding additional models implemented in `pytorch`_
		and `jax`_. Our focus is to facilitate scientific experimentation using
		whatever tools are available at hand.

		In the rest of this tutorial, we'll provide a rapid-fire overview of DeepChem's API. DeepChem is a big library so we won't cover everything, but we should give you enough to get started.

		.. _`sklearn`: https://scikit-learn.org/stable/

		.. _`tensorflow`: https://www.tensorflow.org/

		.. _`xgboost`: https://xgboost.readthedocs.io/en/latest/

		.. _`pytorch`: https://pytorch.org/

		.. _`jax`: https://github.com/google/jax


		Quickstart
		----------
		If you're new, you can install DeepChem on a new machine with the following commands

		.. code-block:: bash

		pip install tensorflow
		pip install deepchem-nightly

		DeepChem is under very active development at present, so we recommend using our nightly build until the next major release. Note that to use DeepChem for chemistry applications, you will also have to install RDKit using conda.

		.. code-block:: bash

		conda install -y -c rdkit -c conda-forge rdkit


		Datasets
		--------
		The :code:`dc.data` module contains utilities to handle :code:`Dataset`
		objects. These :code:`Dataset` objects are the heart of DeepChem. A
		:code:`Dataset` is an abstraction of a dataset in machine learning. That is,
		a collection of features, labels, weights, alongside associated identifiers.
		Rather than explaining further, we'll just show you.

		.. doctest::

		>>> import deepchem as dc
		>>> import numpy as np
		>>> N_samples = 50
		>>> n_features = 10
		>>> X = np.random.rand(N_samples, n_features)
		>>> y = np.random.rand(N_samples)
		>>> dataset = dc.data.NumpyDataset(X, y)
		>>> dataset.X.shape
		(50, 10)
		>>> dataset.y.shape
		(50,)

		Here we've used the :code:`NumpyDataset` class which stores datasets in memory. This works fine for smaller datasets and is very convenient for experimentation, but is less convenient for larger datasets. For that we have the :code:`DiskDataset` class.

		.. doctest::

		>>> dataset = dc.data.DiskDataset.from_numpy(X, y)
		>>> dataset.X.shape
		(50, 10)
		>>> dataset.y.shape
		(50,)

		In this example we haven't specified a data directory, so this :code:`DiskDataset` is written to a temporary folder. Note that :code:`dataset.X` and :code:`dataset.y` load data from disk under the hood! So this can get very expensive for larger datasets.


		More Tutorials
		--------------
		DeepChem maintains an extensive collection of additional `tutorials`_ that are meant to be run on Google `colab`_, an online platform that allows you to execute Jupyter notebooks. Once you've finished this introductory tutorial, we recommend working through these more involved tutorials.

		.. _`tutorials`: https://github.com/deepchem/deepchem/tree/master/examples/tutorials

		.. _`colab`: https://colab.research.google.com/

Admin message