Unverified Commit 0d3f2285 authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #2200 from peastman/butina

Changes to ButinaSplitter
parents 678c6214 e9dd3bb1
Loading
Loading
Loading
Loading
+39 −34
Original line number Diff line number Diff line
@@ -1015,14 +1015,26 @@ class ButinaSplitter(Splitter):
  This class requires RDKit to be installed.
  """

  def split(self,
  def __init__(self, cutoff: float = 0.6):
    """Set up a splitter that groups molecules by Butina clustering.

    Parameters
    ----------
    cutoff: float (default 0.6)
      Clustering threshold, stored on the instance as `self.cutoff`.
      `split()` hands it to `Butina.ClusterData` together with pairwise
      values computed as `1 - Tanimoto similarity` and `isDistData=True`,
      so it presumably acts as a distance threshold (confirm against the
      RDKit documentation).  Molecules that end up in the same cluster are
      kept together in the same dataset.
    """
    # Let the base splitter initialise itself before recording our setting.
    super().__init__()
    self.cutoff = cutoff

  def split(
      self,
      dataset: Dataset,
      frac_train: float = 0.8,
      frac_valid: float = 0.1,
      frac_test: float = 0.1,
      seed: Optional[int] = None,
            log_every_n: Optional[int] = None,
            cutoff: float = 0.18) -> Tuple[List[int], List[int], List]:
      log_every_n: Optional[int] = None) -> Tuple[List[int], List[int], List]:
    """
    Splits internal compounds into train and validation based on the butina
    clustering algorithm. This splitting algorithm has an O(N^2) run time, where N
@@ -1047,19 +1059,12 @@ class ButinaSplitter(Splitter):
      Random seed to use.
    log_every_n: int, optional (default None)
      Log every n examples (not currently used).
    cutoff: float, optional (default 0.18)
      The cutoff value for similarity.

    Returns
    -------
    Tuple[List[int], List[int], List[int]]
      A tuple of train indices, valid indices, and test indices.
      Each indices is a list of integers and test indices is always an empty list.

    Notes
    -----
    This function entirely disregards the ratios for frac_train, frac_valid,
    and frac_test. Furthermore, it does not generate a test set, only a train and valid set.
    """
    try:
      from rdkit import Chem, DataStructs
@@ -1068,7 +1073,7 @@ class ButinaSplitter(Splitter):
    except ModuleNotFoundError:
      raise ValueError("This function requires RDKit to be installed.")

    logger.info("Performing butina clustering with cutoff of", cutoff)
    logger.info("Performing butina clustering with cutoff of", self.cutoff)
    mols = []
    for ind, smiles in enumerate(dataset.ids):
      mols.append(Chem.MolFromSmiles(smiles))
@@ -1081,26 +1086,26 @@ class ButinaSplitter(Splitter):
    for i in range(1, nfps):
      sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
      dists.extend([1 - x for x in sims])
    scaffold_sets = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
    scaffold_sets = Butina.ClusterData(
        dists, nfps, self.cutoff, isDistData=True)
    scaffold_sets = sorted(scaffold_sets, key=lambda x: -len(x))

    ys = dataset.y
    valid_inds = []
    for c_idx, cluster in enumerate(scaffold_sets):
      # for m_idx in cluster:
      valid_inds.extend(cluster)
      # continue until we find an active in all the tasks, otherwise we can't
      # compute a meaningful AUC
      # TODO (ytz): really, we want at least one active and inactive in both scenarios.
      # TODO (Ytz): for regression tasks we'd stop after only one cluster.
      active_populations = np.sum(ys[valid_inds], axis=0)
      if np.all(active_populations):
        logger.info("# of actives per task in valid:", active_populations)
        logger.info("Total # of validation points:", len(valid_inds))
        break

    train_inds = list(itertools.chain.from_iterable(scaffold_sets[c_idx + 1:]))
    return train_inds, valid_inds, []
    train_cutoff = frac_train * len(dataset)
    valid_cutoff = (frac_train + frac_valid) * len(dataset)
    train_inds: List[int] = []
    valid_inds: List[int] = []
    test_inds: List[int] = []

    logger.info("About to sort in scaffold sets")
    for scaffold_set in scaffold_sets:
      if len(train_inds) + len(scaffold_set) > train_cutoff:
        if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
          test_inds += scaffold_set
        else:
          valid_inds += scaffold_set
      else:
        train_inds += scaffold_set
    return train_inds, valid_inds, test_inds


def _generate_scaffold(smiles: str, include_chirality: bool = False) -> str:
+3 −3
Original line number Diff line number Diff line
@@ -204,9 +204,9 @@ class TestSplitter(unittest.TestCase):
    train_data, valid_data, test_data = \
      butina_splitter.train_valid_test_split(
        solubility_dataset)
    assert len(train_data) == 7
    assert len(valid_data) == 3
    assert len(test_data) == 0
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_k_fold_splitter(self):
    """
+15 −7
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# Tutorial Part 8: Working With Splitters

When using machine learning, you typically divide your data into training, validation, and test sets.  The MoleculeNet loaders do this automatically.  But how should you divide up the data?  This question seems simple at first, but it turns out to be quite complicated.  There are many ways of splitting up data, and which one you choose can have a big impact on the reliability of your results.  This tutorial introduces some of the splitting methods provided by DeepChem.

## Colab

This tutorial and the rest in this sequence can be run in Google Colab. If you'd like to open this notebook in Colab, you can use the following link.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepchem/deepchem/blob/master/examples/tutorials/08_Working_With_Splitters.ipynb)

## Setup

To run DeepChem within Colab, you'll need to run the following installation commands. This will take about 5 minutes to run to completion and install your environment. You can of course run this tutorial locally if you prefer. In that case, don't run these cells since they will download and install Anaconda on your local machine.

%% Cell type:code id: tags:

``` python
!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
import conda_installer
conda_installer.install()
!/root/miniconda/bin/conda info -e
```

%% Cell type:code id: tags:

``` python
!pip install --pre deepchem
import deepchem
deepchem.__version__
```

%% Cell type:markdown id: tags:

## Splitters

In DeepChem, a method of splitting samples into multiple datasets is defined by a `Splitter` object.  Choosing an appropriate method for your data is very important.  Otherwise, your trained model may seem to work much better than it really does.

Consider a typical drug development pipeline.  You might begin by screening many thousands of molecules to see if they bind to your target of interest.  Once you find one that seems to work, you try to optimize it by testing thousands of minor variations on it, looking for one that binds more strongly.  Then perhaps you test it in animals and find it has unacceptable toxicity, so you try more variations to fix the problems.

This has an important consequence for chemical datasets: they often include lots of molecules that are very similar to each other.  If you split the data into training and test sets in a naive way, the training set will include many molecules that are very similar to the ones in the test set, even if they are not exactly identical.  As a result, the model may do very well on the test set, but then fail badly when you try to use it on other data that is less similar to the training data.

Let's take a look at a few of the splitters found in DeepChem.

### RandomSplitter

This is one of the simplest splitters.  It just selects samples for the training, validation, and test sets in a completely random way.

Didn't we just say that's a bad idea?  Well, it depends on your data.  If every sample is truly independent of every other, then this is just as good a way as any to split the data.  There is no universally best choice of splitter.  It all depends on your particular dataset, and for some datasets this is a fine choice.

### RandomStratifiedSplitter

Some datasets are very unbalanced: only a tiny fraction of all samples are positive.  In that case, random splitting may sometimes lead to the validation or test set having few or even no positive samples for some tasks.  That makes it impossible to evaluate performance on those tasks.

`RandomStratifiedSplitter` addresses this by dividing up the positive and negative samples evenly.  If you ask for an 80/10/10 split, the validation and test sets will contain not just 10% of samples, but also 10% of the positive samples for each task.

### ScaffoldSplitter

This splitter tries to address the problem discussed above where many molecules are very similar to each other.  It identifies the scaffold that forms the core of each molecule, and ensures that all molecules with the same scaffold are put into the same dataset.  This is still not a perfect solution, since two molecules may have different scaffolds but be very similar in other ways, but it usually is a large improvement over random splitting.

### ButinaSplitter

This is another splitter that tries to address the problem of similar molecules.  It clusters them based on their molecular fingerprints, so that ones with similar fingerprints will tend to be in the same dataset.  The time required by this splitting algorithm scales as the square of the number of molecules, so it is mainly useful for small to medium sized datasets.

### SpecifiedSplitter

This splitter leaves everything up to the user.  You tell it exactly which samples to put in each dataset.  This is useful when you know in advance that a particular splitting is appropriate for your data.

An example is temporal splitting.  Consider a research project where you are continually generating and testing new molecules.  As you gain more data, you periodically retrain your model on the steadily growing dataset, then use it to predict results for other not yet tested molecules.  A good way of validating whether this works is to pick a particular cutoff date, train the model on all data you had at that time, and see how well it predicts other data that was generated later.

## Effect of Using Different Splitters

Let's look at an example.  We will load the Tox21 toxicity dataset using both random and scaffold splitting.  For each one we train a model and evaluate it on the training and test sets.
Let's look at an example.  We will load the Tox21 toxicity dataset using random, scaffold, and Butina splitting.  For each one we train a model and evaluate it on the training and test sets.

%% Cell type:code id: tags:

``` python
import deepchem as dc

# Names of the splitting strategies to compare; each one is passed as the
# `split=` argument of the Tox21 loader below.
splitters = ['random', 'scaffold']
# NOTE: this assignment replaces the list above, so 'butina' is part of the run.
splitters = ['random', 'scaffold', 'butina']
# Every model is scored with ROC AUC.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for splitter in splitters:
    # Load Tox21 with ECFP features, divided up by the current strategy.
    tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='ECFP', split=splitter)
    # The validation set is unpacked but not used in this comparison.
    train_dataset, valid_dataset, test_dataset = datasets
    # n_features=1024 presumably matches the ECFP fingerprint length -- confirm.
    model = dc.models.MultitaskClassifier(n_tasks=len(tasks), n_features=1024, layer_sizes=[1000])
    # Train on the training split only, for 10 epochs.
    model.fit(train_dataset, nb_epoch=10)
    # Report training and test scores side by side so the gap between them
    # can be compared across splitters.
    print('splitter:', splitter)
    print('training set score:', model.evaluate(train_dataset, [metric], transformers))
    print('test set score:', model.evaluate(test_dataset, [metric], transformers))
    print()
```

%% Output

    splitter: random
    training set score: {'roc_auc_score': 0.955262277942416}
    test set score: {'roc_auc_score': 0.7822195797170739}
    training set score: {'roc_auc_score': 0.9560766203173238}
    test set score: {'roc_auc_score': 0.8088861019955839}
    
    splitter: scaffold
    training set score: {'roc_auc_score': 0.9589920031585532}
    test set score: {'roc_auc_score': 0.6864850510346351}
    training set score: {'roc_auc_score': 0.9582835670901536}
    test set score: {'roc_auc_score': 0.6803307954037949}
    
    splitter: butina
    training set score: {'roc_auc_score': 0.9578120869103354}
    test set score: {'roc_auc_score': 0.6057007877463954}
    

%% Cell type:markdown id: tags:

Both of them produce very similar performance on the training set, but the random splitter has much higher performance on the test set.  Does that mean random splitting is better?  No!  It means random splitting doesn't give you an accurate measure of how well your model works.  Because the test set contains lots of molecules that are very similar to ones in the training set, it isn't truly independent.  It makes the model appear to work better than it really does.  Scaffold splitting gives a better indication of what you can expect on independent data in the future.
All of them produce very similar performance on the training set, but the random splitter has much higher performance on the test set.  Scaffold splitting has a lower test set score, and Butina splitting is even lower.  Does that mean random splitting is better?  No!  It means random splitting doesn't give you an accurate measure of how well your model works.  Because the test set contains lots of molecules that are very similar to ones in the training set, it isn't truly independent.  It makes the model appear to work better than it really does.  Scaffold splitting and Butina splitting give a better indication of what you can expect on independent data in the future.

%% Cell type:markdown id: tags:

# Congratulations! Time to join the Community!

Congratulations on completing this tutorial notebook! If you enjoyed working through the tutorial, and want to continue working with DeepChem, we encourage you to finish the rest of the tutorials in this series. You can also help the DeepChem community in the following ways:

## Star DeepChem on [GitHub](https://github.com/deepchem/deepchem)
This helps build awareness of the DeepChem project and the tools for open source drug discovery that we're trying to build.

## Join the DeepChem Gitter
The DeepChem [Gitter](https://gitter.im/deepchem/Lobby) hosts a number of scientists, developers, and enthusiasts interested in deep learning for the life sciences. Join the conversation!