Merge branch 'master' into tf2 (83ab8c3c) · Commits · 钟慕尧 / deepchem

deepchem/splits/splitters.py

+30 −20

Original line number	Diff line number	Diff line
		@@ -852,7 +852,6 @@ class ScaffoldSplitter(Splitter):

		def split(self,
		dataset,
		seed=None,
		frac_train=.8,
		frac_valid=.1,
		frac_test=.1,
		@@ -861,36 +860,47 @@ class ScaffoldSplitter(Splitter):
		Splits internal compounds into train/validation/test by scaffold.
		"""
		np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
		scaffold_sets = self.generate_scaffolds(dataset)

		train_cutoff = frac_train * len(dataset)
		valid_cutoff = (frac_train + frac_valid) * len(dataset)
		train_inds, valid_inds, test_inds = [], [], []

		log("About to sort in scaffold sets", self.verbose)
		for scaffold_set in scaffold_sets:
		if len(train_inds) + len(scaffold_set) > train_cutoff:
		if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
		test_inds += scaffold_set
		else:
		valid_inds += scaffold_set
		else:
		train_inds += scaffold_set
		return train_inds, valid_inds, test_inds

		def generate_scaffolds(self, dataset, log_every_n=1000):
		"""
		Returns all scaffolds from the dataset

		Every entry of `dataset.ids` (named `smiles` here; presumably SMILES
		strings -- confirm) is passed to `generate_scaffold`, and the positional
		indices of the entries are grouped by the scaffold that comes back. The
		groups are returned as a list of index lists, ordered from the largest
		group to the smallest.

		A progress message is logged every `log_every_n` entries.
		"""
		# NOTE(review): this listing appears to show both sides of a diff without
		# +/- markers. The doubled `log` calls below are probably an old/new pair
		# rather than two real statements -- confirm against the repository.
		scaffolds = {}
		log("About to generate scaffolds", self.verbose)
		data_len = len(dataset)

		log("About to generate scaffolds", self.verbose)
		for ind, smiles in enumerate(dataset.ids):
		if ind % log_every_n == 0:
		log("Generating scaffold %d/%d" % (ind, data_len), self.verbose)
		log(f"Generating scaffold {ind} {data_len}", self.verbose)
		scaffold = generate_scaffold(smiles)
		if scaffold not in scaffolds:
		scaffolds[scaffold] = [ind]
		else:
		scaffolds[scaffold].append(ind)

		# Sort from largest to smallest scaffold sets
		# Sets of equal size are ordered by their smallest index, also descending.
		scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
		scaffold_sets = [
		scaffold_set for (scaffold, scaffold_set) in sorted(
		scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True)
		]
		# NOTE(review): the statements from here down to the first `return` use
		# `frac_train` / `frac_valid`, which are not parameters of this method.
		# They look like the removed side of the diff (the train/valid/test
		# assignment that `split` performs) -- confirm against the repository.
		train_cutoff = frac_train * len(dataset)
		valid_cutoff = (frac_train + frac_valid) * len(dataset)
		train_inds, valid_inds, test_inds = [], [], []
		log("About to sort in scaffold sets", self.verbose)
		for scaffold_set in scaffold_sets:
		if len(train_inds) + len(scaffold_set) > train_cutoff:
		if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
		test_inds += scaffold_set
		else:
		valid_inds += scaffold_set
		else:
		train_inds += scaffold_set
		return train_inds, valid_inds, test_inds
		return scaffold_sets


		class FingerprintSplitter(Splitter):

deepchem/splits/test_scaffold_splitter.py

0 → 100644

+27 −0

Original line number	Diff line number	Diff line
		import unittest
		from unittest import TestCase

		import numpy as np
		import deepchem as dc
		from deepchem.splits.splitters import ScaffoldSplitter


		class TestScaffoldSplitter(TestCase):
		  """Checks that ScaffoldSplitter accounts for every datapoint it is given."""

		  def test_scaffolds(self):
		    """Scaffold sets and the three-way split must both cover the dataset."""
		    _, tox21_datasets, _ = dc.molnet.load_tox21(featurizer='GraphConv')
		    train_dataset, _, _ = tox21_datasets
		    n_samples = train_dataset.X.shape[0]

		    splitter = ScaffoldSplitter()
		    scaffold_sets = splitter.generate_scaffolds(train_dataset)
		    train_inds, valid_inds, test_inds = splitter.split(train_dataset)

		    # The scaffold sets together must hold as many indices as there are
		    # datapoints. assertEqual reports both numbers when it fails.
		    self.assertEqual(sum(len(s) for s in scaffold_sets), n_samples)

		    # There can never be more scaffolds than molecules.
		    self.assertLessEqual(len(scaffold_sets), n_samples)

		    # split() hands out whole scaffold sets, so the three index lists
		    # together must account for every datapoint as well.
		    self.assertEqual(
		        len(train_inds) + len(valid_inds) + len(test_inds), n_samples)

examples/tutorials/The_Basic_Tools_of_the_Deep_Life_Sciences.ipynb→examples/tutorials/01_The_Basic_Tools_of_the_Deep_Life_Sciences.ipynb

+55 −8

Original line number	Diff line number	Diff line
		%% Cell type:markdown id: tags:

		# Tutorial: Deep Life Sciences
		Welcome to DeepChem's introductory tutorial for the deep life sciences. This series of notebooks is step-by-step guide for you to get to know the new tools and techniques needed to do deep learning for the life sciences.
		# Tutorial 1: The Basic Tools of the Deep Life Sciences
		Welcome to DeepChem's introductory tutorial for the deep life sciences. This series of notebooks is a step-by-step guide for you to get to know the new tools and techniques needed to do deep learning for the life sciences. We'll start from the basics, assuming that you're new to machine learning and the life sciences, and build up a repertoire of tools and techniques that you can use to do meaningful work in the life sciences.

		Scope: This tutorial will encompass both the machine learning and data handling needed to build systems for the deep life sciences.

		## Colab

		This tutorial and the rest in this sequence are designed to be done in Google Colab. If you'd like to open this notebook in Colab, you can use the following link.

		[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepchem/deepchem/blob/master/examples/tutorials/01_The_Basic_Tools_of_the_Deep_Life_Sciences.ipynb)

		## Outline
		* Part 1: The Basic Tools of the Deep Life Sciences
		* Part 2: Introduction to Molecular Data Handling
		* Part 3: Molecular Machine Learning
		* Part 4:

		## Why do the DeepChem Tutorial?

		1) Career Advancement: Applying AI in the life sciences is a booming
		industry at present. There are a host of newly funded startups and initiatives
		at large pharmaceutical and biotech companies centered around AI. Learning and
		mastering DeepChem will bring you to the forefront of this field and will
		prepare you to enter a career in this field.

		2) Humanitarian Considerations: Disease is the oldest cause of human
		suffering. From the dawn of human civilization, humans have suffered from pathogens,
		cancers, and neurological conditions. One of the greatest achievements of
		the last few centuries has been the development of effective treatments for
		many diseases. By mastering the skills in this tutorial, you will be able to
		stand on the shoulders of the giants of the past to help develop new
		medicine.

		3) Lowering the Cost of Medicine: The art of developing new medicine is
		currently an elite skill that can only be practiced by a small core of expert
		practitioners. By enabling the growth of open source tools for drug discovery,
		you can help democratize these skills and open up drug discovery to more
		competition. Increased competition can help drive down the cost of medicine.

		## Getting Extra Credit
		If you're excited about DeepChem and want to get more involved, there are a couple of things that you can do right now:
		* Start DeepChem on GitHub! - https://github.com/deepchem/deepchem

		* Star DeepChem on GitHub! - https://github.com/deepchem/deepchem
		* Join the DeepChem forums and introduce yourself! - https://forum.deepchem.io
		* Say hi on the DeepChem gitter - https://gitter.im/deepchem/Lobby
		* Make a YouTube video teaching the contents of this notebook.


		## Part -1: Prerequisites

		This tutorial will assume some basic familiarity with the Python data science ecosystem. We will assume that you have familiarity with libraries such as Numpy, Pandas, and TensorFlow.

		## Part 0: Setup
		The first step is to get DeepChem up and running. We recommend using conda for now to do this install.
		```
		conda install -c deepchem -c rdkit -c conda-forge -c omnia deepchem=2.1.0

		The first step is to get DeepChem up and running. We recommend using Google Colab to work through this tutorial series. You'll need to run the following commands to get DeepChem installed on your colab notebook. Note that this will take something like 5 minutes to run on your colab instance.

		%% Cell type:code id: tags:

		``` python
		!wget -c https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh
		!chmod +x Anaconda3-2019.10-Linux-x86_64.sh
		!bash ./Anaconda3-2019.10-Linux-x86_64.sh -b -f -p /usr/local
		!conda install -y -c deepchem -c rdkit -c conda-forge -c omnia deepchem-gpu=2.3.0
		import sys
		sys.path.append('/usr/local/lib/python3.7/site-packages/')
		```

		%% Cell type:markdown id: tags:

		You can of course run this tutorial locally if you prefer. In this case, don't run the above cell since it will download and install Anaconda on your local machine. In either case, we can now import the `deepchem` package to play with.

		%% Cell type:code id: tags:

		``` python
		# Run this cell to see if things work
		import deepchem as dc
		```

		%% Output

		/home/bharath/anaconda3/envs/deepchem/lib/python3.5/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
		from numpy.core.umath_tests import inner1d
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
		warnings.warn(msg, category=FutureWarning)
		RDKit WARNING: [16:55:20] Enabling RDKit 2019.09.3 jupyter extensions
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_qint8 = np.dtype([("qint8", np.int8, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_quint8 = np.dtype([("quint8", np.uint8, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_qint16 = np.dtype([("qint16", np.int16, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_quint16 = np.dtype([("quint16", np.uint16, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_qint32 = np.dtype([("qint32", np.int32, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		np_resource = np.dtype([("resource", np.ubyte, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_qint8 = np.dtype([("qint8", np.int8, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_quint8 = np.dtype([("quint8", np.uint8, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_qint16 = np.dtype([("qint16", np.int16, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_quint16 = np.dtype([("quint16", np.uint16, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		_np_qint32 = np.dtype([("qint32", np.int32, 1)])
		/Users/bharath/opt/anaconda3/envs/deepchem/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
		np_resource = np.dtype([("resource", np.ubyte, 1)])

		%% Cell type:markdown id: tags:

		## The Basic Tools of the Deep Life Sciences
		What does it take to do deep learning on the life sciences? Well, the first thing we'll need to do is actually handle some data. How can we start handling some basic data? For beginners, let's just take a look at some synthetic data.

		To generate some basic synthetic data, we will use Numpy to create some basic arrays.

		%% Cell type:code id: tags:

		``` python
		import numpy as np

		# 4x4 array of random floats drawn from [0, 1)
		data = np.random.random((4, 4))
		labels = np.random.random((4,)) # 1-D array of 4 labels, one per row of `data`
		```

		%% Cell type:markdown id: tags:

		We've given these arrays some evocative names: "data" and "labels." For now, don't worry too much about the names, but just note that the arrays have different shapes. Let's take a quick look to get a feeling for these arrays.

		%% Cell type:code id: tags:

		``` python
		data, labels
		```

		%% Output

		(array([[0.17153735, 0.72653504, 0.75818459, 0.64997769],
		[0.64356789, 0.37895973, 0.46143683, 0.3251195 ],
		[0.51409105, 0.20522909, 0.29532684, 0.35239749],
		[0.49242761, 0.62127102, 0.77898693, 0.90960543]]),
		array([0.01939268, 0.43336842, 0.91222562, 0.23498551]))

		%% Cell type:markdown id: tags:

		In order to be able to work with this data in DeepChem, we need to wrap these arrays so DeepChem knows how to work with them. DeepChem has a `Dataset` API that it uses to facilitate its handling of datasets. For handling of Numpy datasets, we use DeepChem's `NumpyDataset` object.

		%% Cell type:code id: tags:

		``` python
		from deepchem.data.datasets import NumpyDataset

		dataset = NumpyDataset(data, labels)
		```

		%% Cell type:markdown id: tags:

		Ok, now what? We have these arrays in a `NumpyDataset` object. What can we do with it? Let's try printing out the object.

		%% Cell type:code id: tags:

		``` python
		dataset
		```

		%% Output

		<deepchem.data.datasets.NumpyDataset at 0x7ff02682c710>

		%% Cell type:markdown id: tags:

		Ok, that's not terribly informative. It's telling us that `dataset` is a Python object that lives somewhere in memory. Can we recover the two datasets that we used to construct this object? Luckily, the DeepChem API allows us to recover the two original datasets by calling the `dataset.X` and `dataset.y` attributes of the original object.

		%% Cell type:code id: tags:

		``` python
		dataset.X, dataset.y
		```

		%% Output

		(array([[0.17153735, 0.72653504, 0.75818459, 0.64997769],
		[0.64356789, 0.37895973, 0.46143683, 0.3251195 ],
		[0.51409105, 0.20522909, 0.29532684, 0.35239749],
		[0.49242761, 0.62127102, 0.77898693, 0.90960543]]),
		array([0.01939268, 0.43336842, 0.91222562, 0.23498551]))

		%% Cell type:markdown id: tags:

		This set of transformations raises a few questions. First, what was the point of it all? Why would we want to wrap objects this way instead of working with the raw Numpy arrays? The simple answer is to have a unified API for working with larger datasets. Suppose that `X` and `y` are so large that they can't fit easily into memory. What would we do then? Being able to work with an abstract `dataset` object proves very convenient then. In fact, you'll have reason to use this feature of `Dataset` later in the tutorial series.

		What else can we do with the `dataset` object? It turns out that it can be useful to be able to walk through the datapoints in the `dataset` one at a time. For that, we can use the `dataset.itersamples()` method.

		%% Cell type:code id: tags:

		``` python
		for x, y, _, _ in dataset.itersamples():
		print(x, y)
		```

		%% Output

		[0.17153735 0.72653504 0.75818459 0.64997769] 0.019392679983928796
		[0.64356789 0.37895973 0.46143683 0.3251195 ] 0.43336841680990135
		[0.51409105 0.20522909 0.29532684 0.35239749] 0.9122256174354443
		[0.49242761 0.62127102 0.77898693 0.90960543] 0.23498551323364447

		%% Cell type:markdown id: tags:

		There are a couple of other fields that the `dataset` object tracks. The first is `dataset.ids`. This is a listing of unique identifiers for the datapoints in the dataset.

		%% Cell type:code id: tags:

		``` python
		dataset.ids
		```

		%% Output

		array([0, 1, 2, 3], dtype=object)

		%% Cell type:markdown id: tags:

		In addition, the `dataset` object has a field `dataset.w`. This is the "example weight" associated with each datapoint. Since we haven't explicitly assigned the weights, this is simply going to be all ones.

		%% Cell type:code id: tags:

		``` python
		dataset.w
		```

		%% Output

		array([1., 1., 1., 1.])

		%% Cell type:markdown id: tags:

		Alright, we've seen some basic features. What if you want to learn more about `NumpyDataset`? You should check out our more in-depth [notebook](https://deepchem.io/docs/notebooks/Deepchem_NumpyDataset_tutorial.html) that goes into much more depth on how to work with `NumpyDataset` objects.

examples/tutorials/mnist.ipynb→examples/tutorials/02_Learning_MNIST_Digit_Classifiers.ipynb

+15 −1

Original line number	Diff line number	Diff line
		%% Cell type:markdown id: tags:

		# Tutorial Part 2: Learning MNIST Digit Classifiers

		In the previous tutorial, we learned some basics of how to load data into DeepChem and how to use the basic DeepChem objects to load and manipulate this data. In this tutorial, you'll put the parts together and learn how to train a basic image classification model in DeepChem. You might ask, why are we bothering to learn this material in DeepChem? Part of the reason is that image processing is an increasingly important part of AI for the life sciences. So learning how to train image processing models will be very useful for using some of the more advanced DeepChem features.

		The MNIST dataset contains handwritten digits along with their human annotated labels. The learning challenge for this dataset is to train a model that maps the digit image to its true label. MNIST has been a standard benchmark for machine learning for decades at this point.

		![MNIST](mnist_examples.png)

		For convenience, TensorFlow has provided some loader methods to get access to the MNIST dataset. We'll make use of these loaders.
		## Setup

		We recommend running this tutorial on Google Colab. You'll need to run the following cell of installation commands on Colab to get your environment set up. If you'd rather run the tutorial locally, make sure you don't run these commands (since they'll download and install a new Anaconda Python setup).

		%% Cell type:code id: tags:

		``` python
		!wget -c https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh
		!chmod +x Anaconda3-2019.10-Linux-x86_64.sh
		!bash ./Anaconda3-2019.10-Linux-x86_64.sh -b -f -p /usr/local
		!conda install -y -c deepchem -c rdkit -c conda-forge -c omnia deepchem-gpu=2.3.0
		import sys
		sys.path.append('/usr/local/lib/python3.7/site-packages/')
		import deepchem as dc
		```

		%% Cell type:code id: tags:

		``` python
		from tensorflow.examples.tutorials.mnist import input_data
		```

		%% Cell type:code id: tags:

		``` python
		# TODO: This is deprecated. Let's replace with a DeepChem native loader for maintainability.
		mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
		```

		%% Output

		Extracting MNIST_data/train-images-idx3-ubyte.gz
		Extracting MNIST_data/train-labels-idx1-ubyte.gz
		Extracting MNIST_data/t10k-images-idx3-ubyte.gz
		Extracting MNIST_data/t10k-labels-idx1-ubyte.gz

		%% Cell type:code id: tags:

		``` python
		import deepchem as dc
		import tensorflow as tf
		from tensorflow.keras.layers import Reshape, Conv2D, Flatten, Dense, Softmax
		```

		%% Cell type:code id: tags:

		``` python
		train = dc.data.NumpyDataset(mnist.train.images, mnist.train.labels)
		valid = dc.data.NumpyDataset(mnist.validation.images, mnist.validation.labels)
		```

		%% Cell type:code id: tags:

		``` python
		keras_model = tf.keras.Sequential([
		Reshape((28, 28, 1)),
		Conv2D(filters=32, kernel_size=5, activation=tf.nn.relu),
		Conv2D(filters=64, kernel_size=5, activation=tf.nn.relu),
		Flatten(),
		Dense(1024, activation=tf.nn.relu),
		Dense(10),
		Softmax()
		])
		model = dc.models.KerasModel(keras_model, dc.models.losses.CategoricalCrossEntropy())
		```

		%% Cell type:code id: tags:

		``` python
		model.fit(train, nb_epoch=2)
		```

		%% Output

		0.0

		%% Cell type:code id: tags:

		``` python
		from sklearn.metrics import roc_curve, auc
		import numpy as np

		print("Validation")
		# Model output for the whole validation set; squeeze drops any size-1 axes.
		prediction = np.squeeze(model.predict_on_batch(valid.X))

		# One ROC curve and AUC per digit class, keyed by class index.
		fpr = dict()
		tpr = dict()
		roc_auc = dict()
		for i in range(10):
		# Column i of the labels against column i of the predictions.
		# `thresh` is returned by roc_curve but not used afterwards.
		fpr[i], tpr[i], thresh = roc_curve(valid.y[:, i], prediction[:, i])
		roc_auc[i] = auc(fpr[i], tpr[i])
		print("class %s:auc=%s" % (i, roc_auc[i]))
		```

		%% Output

		Validation
		class 0:auc=0.9998836328172079
		class 1:auc=0.9999571662641497
		class 2:auc=0.9998310516219043
		class 3:auc=0.9999563446718672
		class 4:auc=0.9999418111793702
		class 5:auc=0.9995639983771051
		class 6:auc=0.9998478260194437
		class 7:auc=0.9998357507660879
		class 8:auc=0.9999599342922393
		class 9:auc=0.9998551553268534

examples/tutorials/graph_convolutional_networks_for_tox21.ipynb→examples/tutorials/03_Introduction_to_Graph_Convolutions.ipynb

+0 −0

Original line number	Diff line number	Diff line

Admin message