Commit 354d66a2 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes

parent 8387b2e7
Loading
Loading
Loading
Loading
+328 −95
Original line number Diff line number Diff line
@@ -24,7 +24,12 @@ logger = logging.getLogger(__name__)


def generate_scaffold(smiles, include_chirality=False):
  """Compute the Bemis-Murcko scaffold for a SMILES string."""
  """Compute the Bemis-Murcko scaffold for a SMILES string.

  Note
  ----
  This function requires `rdkit` to be installed.
  """
  from rdkit import Chem
  mol = Chem.MolFromSmiles(smiles)
  engine = ScaffoldGenerator(include_chirality=include_chirality)
@@ -43,36 +48,34 @@ def randomize_arrays(array_list):


class Splitter(object):
  """
    Abstract base class for chemically aware splits..
  """Splitters split up Datasets into pieces for training/validation/testing.

  In machine learning applications, it's often necessary to split up a dataset
  into training/validation/test sets, or to k-fold split a dataset (that is,
  divide it into k equal subsets) for cross-validation. The `Splitter` class is
  an abstract superclass for all splitters that captures the common API across
  splitter classes.

  Note that `Splitter` is an abstract superclass. You won't want to
  instantiate this class directly. Rather you will want to use a concrete
  subclass for your application.
  """

  def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
    ----------
    dataset: Dataset
    dataset: `dc.data.Dataset`
      Dataset to do a k-fold split

    k: int
    number of folds

    directories: list of str
      Number of folds to split `dataset` into.
    directories: list[str]
      list of length 2*k filepaths to save the result disk-datasets

    kwargs

    Returns
    -------
    list of length k tuples of (train, cv)

    """
    """
    :param dataset:
    :param k:
    :param directories:
    :param kwargs:
    :return: list of length k tuples of (train, cv)
    list of length k tuples of (train, cv) where `train` and `cv` are both
    lists of `Dataset`s.
    """
    logger.info("Computing K-fold split")
    if directories is None:
@@ -127,7 +130,43 @@ class Splitter(object):
                             **kwargs):
    """ Splits self into train/validation/test sets.

    Returns Dataset objects.
    Returns Dataset objects for train, valid, test.

    Parameters
    ----------
    dataset: data like object. 
      Dataset to be split. This should either be of type
      `dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
      convert into a `Dataset`.
    train_dir: str, optional
      If specified, the directory in which the generated
      training dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    valid_dir: str, optional
      If specified, the directory in which the generated
      valid dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    test_dir: str, optional
      If specified, the directory in which the generated
      test dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    seed: int, optional (default None)
      Random seed to use.
    log_every_n: int, optional
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    Train, validation, and test datasets as dc.data.Dataset objects.
    """
    logger.info("Computing train/valid/test indices")
    train_inds, valid_inds, test_inds = self.split(
@@ -163,7 +202,33 @@ class Splitter(object):
                       frac_train=.8,
                       **kwargs):
    """Splits self into train/test sets.
    Returns Dataset objects.

    Returns Dataset objects for train/test.

    Parameters
    ----------
    dataset: data like object
      Dataset to be split. This should either be of type
      `dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
      convert into a `Dataset`.
    train_dir: str, optional
      If specified, the directory in which the generated
      training dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    test_dir: str, optional
      If specified, the directory in which the generated
      test dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    seed: int, optional (default None)
      Random seed to use.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.

    Returns
    -------
    Train and test datasets as dc.data.Dataset objects.
    """
    valid_dir = tempfile.mkdtemp()
    train_dataset, _, test_dataset = self.train_valid_test_split(
@@ -186,24 +251,48 @@ class Splitter(object):
            frac_test=None,
            log_every_n=None,
            **kwargs):
    """
    Stub to be filled in by child classes.
    """Return indices for specified split

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to be split
    seed: int, optional (default None)
      Random seed to use.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    log_every_n: int, optional
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
    the various splits.
    """
    raise NotImplementedError


class RandomGroupSplitter(Splitter):
  """Random split based on groupings.

  def __init__(self, groups, *args, **kwargs):
  A splitter class that splits on groupings. An example use case is when
  there are multiple conformations of the same molecule that share the same
  topology.  This splitter subsequently guarantees that resulting splits
  preserve groupings.

  Note that it doesn't do any dynamic programming or something fancy to try
  to maximize the choice such that frac_train, frac_valid, or frac_test is
  maximized.  It simply permutes the groups themselves. As such, use with
  caution if the number of elements per group varies significantly.
  """
    A splitter class that splits on groupings. An example use case is when there
    are multiple conformations of the same molecule that share the same topology.
    This splitter subsequently guarantees that resulting splits preserve groupings.

    Note that it doesn't do any dynamic programming or something fancy to try to
    maximize the choice such that frac_train, frac_valid, or frac_test is maximized.
    It simply permutes the groups themselves. As such, use with caution if the number
    of elements per group varies significantly.
  def __init__(self, groups, *args, **kwargs):
    """Initialize this object.

    Parameters
    ----------
@@ -229,6 +318,29 @@ class RandomGroupSplitter(Splitter):
            frac_valid=.1,
            frac_test=.1,
            log_every_n=None):
    """Return indices for specified split

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to be split
    seed: int, optional (default None)
      Random seed to use.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    log_every_n: int, optional
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
    the various splits.
    """

    assert len(self.groups) == dataset.X.shape[0]
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
@@ -267,8 +379,7 @@ class RandomGroupSplitter(Splitter):


class RandomStratifiedSplitter(Splitter):
  """
  RandomStratified Splitter class.
  """RandomStratified Splitter class.

  For sparse multitask datasets, a standard split offers no guarantees
  that the splits will have any active compounds. This class guarantees
@@ -368,7 +479,47 @@ class RandomStratifiedSplitter(Splitter):
                             frac_test=.1,
                             seed=None,
                             log_every_n=1000):
    """Custom split due to raggedness in original split.
    """ Splits self into train/validation/test sets.

    Most splitters use the superclass implementation
    `Splitter.train_valid_test_split` but this class has to override the
    implementation to deal with potentially ragged splits. 

    Parameters
    ----------
    dataset: data like object. 
      Dataset to be split. This should either be of type
      `dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
      convert into a `Dataset`.
    train_dir: str, optional
      If specified, the directory in which the generated
      training dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    valid_dir: str, optional
      If specified, the directory in which the generated
      valid dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    test_dir: str, optional
      If specified, the directory in which the generated
      test dataset should be stored. This is only
      considered if `isinstance(dataset, dc.data.DiskDataset)`
      is True.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    seed: int, optional (default None)
      Random seed to use.
    log_every_n: int, optional
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    Train, validation, and test datasets as dc.data.Dataset objects.
    """
    if train_dir is None:
      train_dir = tempfile.mkdtemp()
@@ -414,10 +565,10 @@ class RandomStratifiedSplitter(Splitter):


class SingletaskStratifiedSplitter(Splitter):
  """
    Class for doing data splits by stratification on a single task.
  """Class for doing data splits by stratification on a single task.

    Example:
  Example
  -------

  >>> n_samples = 100
  >>> n_features = 10
@@ -556,6 +707,10 @@ class SingletaskStratifiedSplitter(Splitter):
class MolecularWeightSplitter(Splitter):
  """
  Class for doing data splits by molecular weight.

  Note
  ----
  This class requires `rdkit` to be installed.
  """

  def split(self,
@@ -565,9 +720,31 @@ class MolecularWeightSplitter(Splitter):
            frac_valid=.1,
            frac_test=.1,
            log_every_n=None):
    """
        Splits internal compounds into train/validation/test using the MW calculated
        by SMILES string.
    """Splits on molecular weight.

    Splits internal compounds into train/validation/test using the MW
    calculated by SMILES string.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to be split
    seed: int, optional (default None)
      Random seed to use.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    log_every_n: int, optional
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
    the various splits.
    """

    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
@@ -593,11 +770,16 @@ class MolecularWeightSplitter(Splitter):


class MaxMinSplitter(Splitter):
  """
  """Chemical diversity splitter.

  Class for doing splits based on the MaxMin diversity algorithm. Intuitively,
  the test set is comprised of the most diverse compounds of the entire dataset.
  Furthermore, the validation set is comprised of diverse compounds under
  the test set.

  Note
  ----
  This class requires `rdkit` to be installed.
  """

  def split(self,
@@ -667,8 +849,7 @@ class MaxMinSplitter(Splitter):


class RandomSplitter(Splitter):
  """
    Class for doing random data splits.
  """Class for doing random data splits.
  """

  def split(self,
@@ -680,6 +861,27 @@ class RandomSplitter(Splitter):
            log_every_n=None):
    """
    Splits internal compounds randomly into train/validation/test.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to be split
    seed: int, optional (default None)
      Random seed to use.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    log_every_n: int, optional
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
    the various splits.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    if not seed is None:
@@ -693,8 +895,13 @@ class RandomSplitter(Splitter):


class IndexSplitter(Splitter):
  """
    Class for simple order based splits.
  """Class for simple order based splits.

  Use this class when the `Dataset` you have is already ordered as you would
  like it to be processed. Then the first `frac_train` proportion is used for
  training, the next `frac_valid` for validation, and the final `frac_test` for
  testing. This class may make sense to use if your `Dataset` is already time
  ordered (for example).
  """

  def split(self,
@@ -704,8 +911,28 @@ class IndexSplitter(Splitter):
            frac_valid=.1,
            frac_test=.1,
            log_every_n=None):
    """
        Splits internal compounds into train/validation/test in provided order.
    """Splits internal compounds into train/validation/test in provided order.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to be split
    seed: int, optional (default None)
      Random seed to use.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    log_every_n: int, optional
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers) for
    the various splits.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    num_datapoints = len(dataset)
@@ -717,8 +944,14 @@ class IndexSplitter(Splitter):


class IndiceSplitter(Splitter):
  """
    Class for splits based on input order.
  """Split data in the fashion specified by user.

  For some applications, you will already know how you'd like to split the
  dataset. In this splitter, you simply specify `valid_indices` and
  `test_indices` and the datapoints at those indices are pulled out of the
  dataset. Note that this is different from `IndexSplitter` which only splits
  based on the existing dataset ordering, while this `IndiceSplitter` can
  split on any specified ordering.
  """

  def __init__(self, valid_indices=None, test_indices=None):
+1 −0
Original line number Diff line number Diff line
@@ -124,6 +124,7 @@ discussions about research, development or any general questions. If you'd like
   :name: mastertoc

   Introduction <index>
   Tutorial <tutorial>
   Installation <installation>
   Datasets <datasets>
   Data Loaders <dataloaders>

docs/tutorial.rst

0 → 100644
+84 −0
Original line number Diff line number Diff line
DeepChem Tutorial
=================

If you're new to DeepChem, you probably want to know the basics. What is DeepChem? Why should you care about using it? The short answer is that DeepChem is a scientific machine learning library. (The "Chem" indicates the historical fact that DeepChem initially focused on chemical applications, but we aim to support all types of scientific applications more broadly).

Why would you want to use DeepChem instead of another machine learning
library? Simply put, DeepChem maintains an extensive collection of utilities
to enable scientific deep learning including classes for loading scientific
datasets, processing them, transforming them, splitting them up, and learning
from them. Behind the scenes DeepChem uses a variety of other machine
learning frameworks such as `sklearn`_, `tensorflow`_, and `xgboost`_. We are
also experimenting with adding additional models implemented in `pytorch`_
and `jax`_. Our focus is to facilitate scientific experimentation using
whatever tools are available at hand.

In the rest of this tutorial, we'll provide a rapid-fire overview of DeepChem's API. DeepChem is a big library, so we won't cover everything, but we should give you enough to get started.

.. _`sklearn`: https://scikit-learn.org/stable/

.. _`tensorflow`: https://www.tensorflow.org/

.. _`xgboost`: https://xgboost.readthedocs.io/en/latest/

.. _`pytorch`: https://pytorch.org/

.. _`jax`: https://github.com/google/jax


Quickstart
----------
If you're new, you can install DeepChem on a new machine with the following commands:

.. code-block:: bash

  pip install tensorflow
  pip install deepchem-nightly

DeepChem is under very active development at present, so we recommend using our nightly build until we publish the next major release. Note that to use DeepChem for chemistry applications, you will also have to install RDKit using conda.

.. code-block:: bash

  conda install -y -c rdkit -c conda-forge rdkit


Datasets
--------
The :code:`dc.data` module contains utilities to handle :code:`Dataset`
objects. These :code:`Dataset` objects are the heart of DeepChem. A
:code:`Dataset` is an abstraction of a dataset in machine learning. That is,
a collection of features, labels, weights, alongside associated identifiers.
Rather than explaining further, we'll just show you.

.. doctest:: 

   >>> import deepchem as dc
   >>> import numpy as np
   >>> N_samples = 50
   >>> n_features = 10
   >>> X = np.random.rand(N_samples, n_features)
   >>> y = np.random.rand(N_samples)
   >>> dataset = dc.data.NumpyDataset(X, y) 
   >>> dataset.X.shape
   (50, 10)
   >>> dataset.y.shape
   (50,)

Here we've used the :code:`NumpyDataset` class which stores datasets in memory. This works fine for smaller datasets and is very convenient for experimentation, but is less convenient for larger datasets. For that we have the :code:`DiskDataset` class.

.. doctest::

   >>> dataset = dc.data.DiskDataset.from_numpy(X, y)
   >>> dataset.X.shape
   (50, 10)
   >>> dataset.y.shape
   (50,)

In this example we haven't specified a data directory, so this :code:`DiskDataset` is written to a temporary folder. Note that :code:`dataset.X` and :code:`dataset.y` load data from disk under the hood! So this can get very expensive for larger datasets.


More Tutorials
--------------
DeepChem maintains an extensive collection of additional `tutorials`_ that are meant to be run on Google `colab`_, an online platform that allows you to execute Jupyter notebooks. Once you've finished this introductory tutorial, we recommend working through these more involved tutorials.

.. _`tutorials`: https://github.com/deepchem/deepchem/tree/master/examples/tutorials

.. _`colab`: https://colab.research.google.com/