Merge pull request #2200 from peastman/butina (0d3f2285) · Commits · 钟慕尧 / deepchem

deepchem/splits/splitters.py

+39 −34

Original line number	Diff line number	Diff line
		@@ -1015,14 +1015,26 @@ class ButinaSplitter(Splitter):
		This class requires RDKit to be installed.
		"""

		def split(self,
		def __init__(self, cutoff: float = 0.6):
		"""Create a ButinaSplitter.

		Parameters
		----------
		cutoff: float (default 0.6)
		The cutoff value for tanimoto similarity. Molecules that are more similar
		than this will tend to be put in the same dataset.
		"""
		super(ButinaSplitter, self).__init__()
		self.cutoff = cutoff

		def split(
		self,
		dataset: Dataset,
		frac_train: float = 0.8,
		frac_valid: float = 0.1,
		frac_test: float = 0.1,
		seed: Optional[int] = None,
		log_every_n: Optional[int] = None,
		cutoff: float = 0.18) -> Tuple[List[int], List[int], List]:
		log_every_n: Optional[int] = None) -> Tuple[List[int], List[int], List]:
		"""
		Splits internal compounds into train and validation based on the butina
		clustering algorithm. This splitting algorithm has an O(N^2) run time, where N
		@@ -1047,19 +1059,12 @@ class ButinaSplitter(Splitter):
		Random seed to use.
		log_every_n: int, optional (default None)
		Log every n examples (not currently used).
		cutoff: float, optional (default 0.18)
		The cutoff value for similarity.

		Returns
		-------
		Tuple[List[int], List[int], List[int]]
		A tuple of train indices, valid indices, and test indices.
		Each indices is a list of integers and test indices is always an empty list.

		Notes
		-----
		This function entirely disregards the ratios for frac_train, frac_valid,
		and frac_test. Furthermore, it does not generate a test set, only a train and valid set.
		"""
		try:
		from rdkit import Chem, DataStructs
		@@ -1068,7 +1073,7 @@ class ButinaSplitter(Splitter):
		except ModuleNotFoundError:
		raise ValueError("This function requires RDKit to be installed.")

		logger.info("Performing butina clustering with cutoff of", cutoff)
		logger.info("Performing butina clustering with cutoff of", self.cutoff)
		mols = []
		for ind, smiles in enumerate(dataset.ids):
		mols.append(Chem.MolFromSmiles(smiles))
		@@ -1081,26 +1086,26 @@ class ButinaSplitter(Splitter):
		for i in range(1, nfps):
		sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
		dists.extend([1 - x for x in sims])
		scaffold_sets = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
		scaffold_sets = Butina.ClusterData(
		dists, nfps, self.cutoff, isDistData=True)
		scaffold_sets = sorted(scaffold_sets, key=lambda x: -len(x))

		ys = dataset.y
		valid_inds = []
		for c_idx, cluster in enumerate(scaffold_sets):
		# for m_idx in cluster:
		valid_inds.extend(cluster)
		# continue until we find an active in all the tasks, otherwise we can't
		# compute a meaningful AUC
		# TODO (ytz): really, we want at least one active and inactive in both scenarios.
		# TODO (Ytz): for regression tasks we'd stop after only one cluster.
		active_populations = np.sum(ys[valid_inds], axis=0)
		if np.all(active_populations):
		logger.info("# of actives per task in valid:", active_populations)
		logger.info("Total # of validation points:", len(valid_inds))
		break

		train_inds = list(itertools.chain.from_iterable(scaffold_sets[c_idx + 1:]))
		return train_inds, valid_inds, []
		train_cutoff = frac_train * len(dataset)
		valid_cutoff = (frac_train + frac_valid) * len(dataset)
		train_inds: List[int] = []
		valid_inds: List[int] = []
		test_inds: List[int] = []

		logger.info("About to sort in scaffold sets")
		for scaffold_set in scaffold_sets:
		if len(train_inds) + len(scaffold_set) > train_cutoff:
		if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
		test_inds += scaffold_set
		else:
		valid_inds += scaffold_set
		else:
		train_inds += scaffold_set
		return train_inds, valid_inds, test_inds


		def _generate_scaffold(smiles: str, include_chirality: bool = False) -> str:

deepchem/splits/tests/test_splitter.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -204,9 +204,9 @@ class TestSplitter(unittest.TestCase):
		train_data, valid_data, test_data = \
		butina_splitter.train_valid_test_split(
		solubility_dataset)
		assert len(train_data) == 7
		assert len(valid_data) == 3
		assert len(test_data) == 0
		assert len(train_data) == 8
		assert len(valid_data) == 1
		assert len(test_data) == 1

		def test_k_fold_splitter(self):
		"""

examples/tutorials/08_Working_With_Splitters.ipynb

+15 −7

Original line number	Diff line number	Diff line
		%% Cell type:markdown id: tags:

		# Tutorial Part 8: Working With Splitters

		When using machine learning, you typically divide your data into training, validation, and test sets. The MoleculeNet loaders do this automatically. But how should you divide up the data? This question seems simple at first, but it turns out to be quite complicated. There are many ways of splitting up data, and which one you choose can have a big impact on the reliability of your results. This tutorial introduces some of the splitting methods provided by DeepChem.

		## Colab

		This tutorial and the rest in this sequence can be done in Google Colab. If you'd like to open this notebook in Colab, you can use the following link.

		[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepchem/deepchem/blob/master/examples/tutorials/08_Working_With_Splitters.ipynb)

		## Setup

		To run DeepChem within Colab, you'll need to run the following installation commands. This will take about 5 minutes to run to completion and install your environment. You can of course run this tutorial locally if you prefer. In that case, don't run these cells since they will download and install Anaconda on your local machine.

		%% Cell type:code id: tags:

		``` python
		!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
		import conda_installer
		conda_installer.install()
		!/root/miniconda/bin/conda info -e
		```

		%% Cell type:code id: tags:

		``` python
		!pip install --pre deepchem
		import deepchem
		deepchem.__version__
		```

		%% Cell type:markdown id: tags:

		## Splitters

		In DeepChem, a method of splitting samples into multiple datasets is defined by a `Splitter` object. Choosing an appropriate method for your data is very important. Otherwise, your trained model may seem to work much better than it really does.

		Consider a typical drug development pipeline. You might begin by screening many thousands of molecules to see if they bind to your target of interest. Once you find one that seems to work, you try to optimize it by testing thousands of minor variations on it, looking for one that binds more strongly. Then perhaps you test it in animals and find it has unacceptable toxicity, so you try more variations to fix the problems.

		This has an important consequence for chemical datasets: they often include lots of molecules that are very similar to each other. If you split the data into training and test sets in a naive way, the training set will include many molecules that are very similar to the ones in the test set, even if they are not exactly identical. As a result, the model may do very well on the test set, but then fail badly when you try to use it on other data that is less similar to the training data.

		Let's take a look at a few of the splitters found in DeepChem.

		### RandomSplitter

		This is one of the simplest splitters. It just selects samples for the training, validation, and test sets in a completely random way.

		Didn't we just say that's a bad idea? Well, it depends on your data. If every sample is truly independent of every other, then this is just as good a way as any to split the data. There is no universally best choice of splitter. It all depends on your particular dataset, and for some datasets this is a fine choice.

		### RandomStratifiedSplitter

		Some datasets are very unbalanced: only a tiny fraction of all samples are positive. In that case, random splitting may sometimes lead to the validation or test set having few or even no positive samples for some tasks. That makes it impossible to evaluate performance on those tasks.

		`RandomStratifiedSplitter` addresses this by dividing up the positive and negative samples evenly. If you ask for an 80/10/10 split, the validation and test sets will contain not just 10% of samples, but also 10% of the positive samples for each task.

		### ScaffoldSplitter

		This splitter tries to address the problem discussed above where many molecules are very similar to each other. It identifies the scaffold that forms the core of each molecule, and ensures that all molecules with the same scaffold are put into the same dataset. This is still not a perfect solution, since two molecules may have different scaffolds but be very similar in other ways, but it usually is a large improvement over random splitting.

		### ButinaSplitter

		This is another splitter that tries to address the problem of similar molecules. It clusters them based on their molecular fingerprints, so that ones with similar fingerprints will tend to be in the same dataset. The time required by this splitting algorithm scales as the square of the number of molecules, so it is mainly useful for small to medium sized datasets.

		### SpecifiedSplitter

		This splitter leaves everything up to the user. You tell it exactly which samples to put in each dataset. This is useful when you know in advance that a particular splitting is appropriate for your data.

		An example is temporal splitting. Consider a research project where you are continually generating and testing new molecules. As you gain more data, you periodically retrain your model on the steadily growing dataset, then use it to predict results for other not yet tested molecules. A good way of validating whether this works is to pick a particular cutoff date, train the model on all data you had at that time, and see how well it predicts other data that was generated later.

		## Effect of Using Different Splitters

		Let's look at an example. We will load the Tox21 toxicity dataset using both random and scaffold splitting. For each one we train a model and evaluate it on the training and test sets.
		Let's look at an example. We will load the Tox21 toxicity dataset using random, scaffold, and Butina splitting. For each one we train a model and evaluate it on the training and test sets.

		%% Cell type:code id: tags:

		``` python
		import deepchem as dc

		splitters = ['random', 'scaffold']
		splitters = ['random', 'scaffold', 'butina']
		metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
		for splitter in splitters:
		tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='ECFP', split=splitter)
		train_dataset, valid_dataset, test_dataset = datasets
		model = dc.models.MultitaskClassifier(n_tasks=len(tasks), n_features=1024, layer_sizes=[1000])
		model.fit(train_dataset, nb_epoch=10)
		print('splitter:', splitter)
		print('training set score:', model.evaluate(train_dataset, [metric], transformers))
		print('test set score:', model.evaluate(test_dataset, [metric], transformers))
		print()
		```

		%% Output

		splitter: random
		training set score: {'roc_auc_score': 0.955262277942416}
		test set score: {'roc_auc_score': 0.7822195797170739}
		training set score: {'roc_auc_score': 0.9560766203173238}
		test set score: {'roc_auc_score': 0.8088861019955839}

		splitter: scaffold
		training set score: {'roc_auc_score': 0.9589920031585532}
		test set score: {'roc_auc_score': 0.6864850510346351}
		training set score: {'roc_auc_score': 0.9582835670901536}
		test set score: {'roc_auc_score': 0.6803307954037949}

		splitter: butina
		training set score: {'roc_auc_score': 0.9578120869103354}
		test set score: {'roc_auc_score': 0.6057007877463954}


		%% Cell type:markdown id: tags:

		Both of them produce very similar performance on the training set, but the random splitter has much higher performance on the test set. Does that mean random splitting is better? No! It means random splitting doesn't give you an accurate measure of how well your model works. Because the test set contains lots of molecules that are very similar to ones in the training set, it isn't truly independent. It makes the model appear to work better than it really does. Scaffold splitting gives a better indication of what you can expect on independent data in the future.
		All of them produce very similar performance on the training set, but the random splitter has much higher performance on the test set. Scaffold splitting has a lower test set score, and Butina splitting is even lower. Does that mean random splitting is better? No! It means random splitting doesn't give you an accurate measure of how well your model works. Because the test set contains lots of molecules that are very similar to ones in the training set, it isn't truly independent. It makes the model appear to work better than it really does. Scaffold splitting and Butina splitting give a better indication of what you can expect on independent data in the future.

		%% Cell type:markdown id: tags:

		# Congratulations! Time to join the Community!

		Congratulations on completing this tutorial notebook! If you enjoyed working through the tutorial, and want to continue working with DeepChem, we encourage you to finish the rest of the tutorials in this series. You can also help the DeepChem community in the following ways:

		## Star DeepChem on [GitHub](https://github.com/deepchem/deepchem)
		This helps build awareness of the DeepChem project and the tools for open source drug discovery that we're trying to build.

		## Join the DeepChem Gitter
		The DeepChem [Gitter](https://gitter.im/deepchem/Lobby) hosts a number of scientists, developers, and enthusiasts interested in deep learning for the life sciences. Join the conversation!

Admin message