Unverified Commit 6aad136f authored by Mufei Li's avatar Mufei Li Committed by GitHub
Browse files

Merge pull request #5 from deepchem/master

Update
parents a5a53f46 9664aeab
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@ Imports all submodules
"""

# If you push the tag, please remove `.dev`
__version__ = '2.5.0'
__version__ = '2.6.0.dev'

import deepchem.data
import deepchem.feat
+9 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ except:
  from collections import Sequence as SequenceCollection
from typing import Sequence, Union
from deepchem.utils.typing import KerasActivationFn, LossFn, OneOrMany
from deepchem.utils.data_utils import load_from_disk, save_to_disk

logger = logging.getLogger(__name__)

@@ -298,3 +299,11 @@ class AtomicConvModel(KerasModel):
        ]
        y_b = np.reshape(y_b, newshape=(batch_size, 1))
        yield (inputs, [y_b], [w_b])

  def save(self):
    """Serialize the underlying model to disk using joblib."""
    # Persist to the canonical model filename under ``model_dir``.
    target_path = self.get_model_filename(self.model_dir)
    save_to_disk(self.model, target_path)

  def reload(self):
    """Restore the model previously persisted by ``save`` from its joblib file."""
    # Read back from the same canonical filename ``save`` wrote to.
    source_path = self.get_model_filename(self.model_dir)
    self.model = load_from_disk(source_path)
+67 −0
Original line number Diff line number Diff line
@@ -282,6 +282,73 @@ def test_robust_multitask_classification_reload():
  assert scores[classification_metric.name] > .9


def test_atomic_conv_model_reload():
  """Train an AtomicConvModel, reload it from disk, and verify that the
  reloaded model produces identical predictions."""
  from deepchem.models.atomic_conv import AtomicConvModel
  from deepchem.data import NumpyDataset
  model_dir = tempfile.mkdtemp()
  batch_size = 1
  N_atoms = 5

  acm = AtomicConvModel(
      n_tasks=1,
      batch_size=batch_size,
      layer_sizes=[
          1,
      ],
      frag1_num_atoms=N_atoms,
      frag2_num_atoms=N_atoms,
      complex_num_atoms=2 * N_atoms,
      model_dir=model_dir)

  # Build a single random complex: two N_atoms fragments plus the combined
  # 2*N_atoms system, each with an empty neighbor list per atom.
  frag1_coords = np.random.rand(N_atoms, 3)
  frag1_nbr_list = {i: [] for i in range(N_atoms)}
  frag1_z = np.random.randint(10, size=(N_atoms))
  frag2_coords = np.random.rand(N_atoms, 3)
  frag2_nbr_list = {i: [] for i in range(N_atoms)}
  frag2_z = np.random.randint(10, size=(N_atoms))
  system_coords = np.random.rand(2 * N_atoms, 3)
  system_nbr_list = {i: [] for i in range(2 * N_atoms)}
  system_z = np.random.randint(10, size=(2 * N_atoms))

  features = [(frag1_coords, frag1_nbr_list, frag1_z, frag2_coords,
               frag2_nbr_list, frag2_z, system_coords, system_nbr_list,
               system_z)]
  # dtype=object is required: the tuple mixes arrays and dicts of differing
  # shapes, and NumPy >= 1.24 raises ValueError on implicit ragged arrays.
  features = np.asarray(features, dtype=object)
  labels = np.random.rand(batch_size)
  dataset = NumpyDataset(features, labels)

  acm.fit(dataset, nb_epoch=1)

  # Construct an identically configured model and restore the checkpoint
  # written during training.
  reloaded_model = AtomicConvModel(
      n_tasks=1,
      batch_size=batch_size,
      layer_sizes=[
          1,
      ],
      frag1_num_atoms=N_atoms,
      frag2_num_atoms=N_atoms,
      complex_num_atoms=2 * N_atoms,
      model_dir=model_dir)
  reloaded_model.restore()

  # Predictions from the original and reloaded models must match exactly.
  origpred = acm.predict(dataset)
  reloadpred = reloaded_model.predict(dataset)
  assert np.all(origpred == reloadpred)


def test_normalizing_flow_model_reload():
  """Test that NormalizingFlowModel can be reloaded correctly."""
  from deepchem.models.normalizing_flows import NormalizingFlow, NormalizingFlowModel
+17 −23
Original line number Diff line number Diff line
%% Cell type:markdown id: tags:

# Tutorial 1: The Basic Tools of the Deep Life Sciences
Welcome to DeepChem's introductory tutorial for the deep life sciences. This series of notebooks is a step-by-step guide for you to get to know the new tools and techniques needed to do deep learning for the life sciences. We'll start from the basics, assuming that you're new to machine learning and the life sciences, and build up a repertoire of tools and techniques that you can use to do meaningful work in the life sciences.

**Scope:** This tutorial will encompass both the machine learning and data handling needed to build systems for the deep life sciences.

## Colab

This tutorial and the rest in this series are designed to be run in Google Colab. If you'd like to open this notebook in Colab, you can use the following link.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepchem/deepchem/blob/master/examples/tutorials/01_The_Basic_Tools_of_the_Deep_Life_Sciences.ipynb)


## Why do the DeepChem Tutorial?

**1) Career Advancement:** Applying AI in the life sciences is a booming
industry at present. There are a host of newly funded startups and initiatives
at large pharmaceutical and biotech companies centered around AI. Learning and
mastering DeepChem will bring you to the forefront of this field and will
prepare you to enter a career in this field.

**2) Humanitarian Considerations:** Disease is the oldest cause of human
suffering. From the dawn of human civilization, humans have suffered from pathogens,
cancers, and neurological conditions. One of the greatest achievements of
the last few centuries has been the development of effective treatments for
many diseases. By mastering the skills in this tutorial, you will be able to
stand on the shoulders of the giants of the past to help develop new
medicine.

**3) Lowering the Cost of Medicine:** The art of developing new medicine is
currently an elite skill that can only be practiced by a small core of expert
practitioners. By enabling the growth of open source tools for drug discovery,
you can help democratize these skills and open up drug discovery to more
competition. Increased competition can help drive down the cost of medicine.

## Getting Extra Credit
If you're excited about DeepChem and want to get more involved, there are some things that you can do right now:

* Star DeepChem on GitHub! - https://github.com/deepchem/deepchem
* Join the DeepChem forums and introduce yourself! - https://forum.deepchem.io
* Say hi on the DeepChem gitter - https://gitter.im/deepchem/Lobby
* Make a YouTube video teaching the contents of this notebook.


## Prerequisites

This tutorial sequence will assume some basic familiarity with the Python data science ecosystem. We will assume that you have familiarity with libraries such as Numpy, Pandas, and TensorFlow. We'll provide some brief refreshers on basics through the tutorial so don't worry if you're not an expert.

## Setup

The first step is to get DeepChem up and running. We recommend using Google Colab to work through this tutorial series. You'll need to run the following commands to get DeepChem installed on your colab notebook. Note that this will take something like 5 minutes to run on your colab instance.

%% Cell type:code id: tags:

``` python
!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
import conda_installer
conda_installer.install()
!/root/miniconda/bin/conda info -e
```

%% Cell type:code id: tags:

``` python
!pip install --pre deepchem
```

%% Cell type:markdown id: tags:

You can of course run this tutorial locally if you prefer. In this case, don't run the above cell since it will download and install Anaconda on your local machine. In either case, we can now import the `deepchem` package to play with.

%% Cell type:code id: tags:

``` python
import deepchem as dc
dc.__version__
```

%% Output

    '2.4.0-rc1.dev'
    '2.5.0.dev'

%% Cell type:markdown id: tags:

# Training a Model with DeepChem: A First Example

Deep learning can be used to solve many sorts of problems, but the basic workflow is usually the same.  Here are the typical steps you follow.

1. Select the data set you will train your model on (or create a new data set if there isn't an existing suitable one).
2. Create the model.
3. Train the model on the data.
4. Evaluate the model on an independent test set to see how well it works.
5. Use the model to make predictions about new data.

With DeepChem, each of these steps can be as little as one or two lines of Python code.  In this tutorial we will walk through a basic example showing the complete workflow to solve a real world scientific problem.

The problem we will solve is predicting the solubility of small molecules given their chemical formulas.  This is a very important property in drug development: if a proposed drug isn't soluble enough, you probably won't be able to get enough into the patient's bloodstream to have a therapeutic effect.  The first thing we need is a data set of measured solubilities for real molecules.  One of the core components of DeepChem is MoleculeNet, a diverse collection of chemical and molecular data sets.  For this tutorial, we can use the Delaney solubility data set.

%% Cell type:code id: tags:

``` python
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets
```

%% Cell type:markdown id: tags:

I won't say too much about this code right now.  We will see many similar examples in later tutorials.  There are two details I do want to draw your attention to.  First, notice the `featurizer` argument passed to the `load_delaney()` function.  Molecules can be represented in many ways.  We therefore tell it which representation we want to use, or in more technical language, how to "featurize" the data.  Second, notice that we actually get three different data sets: a training set, a validation set, and a test set.  Each of these serves a different function in the standard deep learning workflow.

Now that we have our data, the next step is to create a model.  We will use a particular kind of model called a "graph convolutional network", or "graphconv" for short.

%% Cell type:code id: tags:

``` python
model = dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)
```

%% Cell type:markdown id: tags:

Here again I will not say much about the code.  Later tutorials will give lots more information about `GraphConvModel`, as well as other types of models provided by DeepChem.

We now need to train the model on the data set.  We simply give it the data set and tell it how many epochs of training to perform (that is, how many complete passes through the data to make).

%% Cell type:code id: tags:

``` python
model.fit(train_dataset, nb_epoch=100)
```

%% Output

    /Users/peastman/miniconda3/envs/tf2/lib/python3.7/site-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
      "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "

    0.1147727108001709

%% Cell type:markdown id: tags:

If everything has gone well, we should now have a fully trained model!  But do we?  To find out, we must evaluate the model on the test set.  We do that by selecting an evaluation metric and calling `evaluate()` on the model.  For this example, let's use the squared Pearson correlation coefficient, known as r<sup>2</sup>, as our metric.  We can evaluate it on both the training set and test set.

%% Cell type:code id: tags:

``` python
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric], transformers))
print("Test set score:", model.evaluate(test_dataset, [metric], transformers))
```

%% Output

    Training set score: {'pearson_r2_score': 0.8914309123616354}
    Test set score: {'pearson_r2_score': 0.7744246373275885}
    Training set score: {'pearson_r2_score': 0.9323622956442351}
    Test set score: {'pearson_r2_score': 0.6898768897014962}

%% Cell type:markdown id: tags:

Notice that it has a higher score on the training set than the test set.  Models usually perform better on the particular data they were trained on than they do on similar but independent data.  This is called "overfitting", and it is the reason it is essential to evaluate your model on an independent test set.

Our model still has quite respectable performance on the test set.  For comparison, a model that produced totally random outputs would have a correlation of 0, while one that made perfect predictions would have a correlation of 1.  Our model does quite well, so now we can use it to make predictions about other molecules we care about.

Since this is just a tutorial and we don't have any other molecules we specifically want to predict, let's just use the first ten molecules from the test set.  For each one we print out the chemical structure (represented as a SMILES string) and the predicted solubility.
Since this is just a tutorial and we don't have any other molecules we specifically want to predict, let's just use the first ten molecules from the test set.  For each one we print out the chemical structure (represented as a SMILES string) and the predicted solubility. To put these predictions in
context, we print out the solubility values from the test set as well.

%% Cell type:code id: tags:

``` python
solubilities = model.predict_on_batch(test_dataset.X[:10])
for molecule, solubility in zip(test_dataset.ids, solubilities):
    print(solubility, molecule)
for molecule, solubility, test_solubility in zip(test_dataset.ids, solubilities, test_dataset.y):
    print(solubility, test_solubility, molecule)
```

%% Output

    [-1.4806377] C1c2ccccc2c3ccc4ccccc4c13
    [0.37774816] COc1ccccc1Cl
    [-1.3225354] COP(=S)(OC)Oc1cc(Cl)c(Br)cc1Cl
    [-0.590009] ClC(Cl)CC(=O)NC2=C(Cl)C(=O)c1ccccc1C2=O
    [-2.0383604] ClC(Cl)C(c1ccc(Cl)cc1)c2ccc(Cl)cc2
    [2.0883522] COC(=O)C=C
    [-0.25627953] CN(C)C(=O)Nc2ccc(Oc1ccc(Cl)cc1)cc2
    [0.97384584] N(=Nc1ccccc1)c2ccccc2
    [-0.40858203] CC(C)c1ccc(C)cc1
    [1.1107407] Oc1c(Cl)cccc1Cl
    [-1.8629359] [-1.60114461] c1cc2ccc3cccc4ccc(c1)c2c34
    [0.6617248] [0.20848251] Cc1cc(=O)[nH]c(=S)[nH]1
    [-0.5705674] [-0.01602738] Oc1ccc(cc1)C2(OC(=O)c3ccccc23)c4ccc(O)cc4
    [-2.0929456] [-2.82191713] c1ccc2c(c1)cc3ccc4cccc5ccc2c3c45
    [-1.4962314] [-0.52891635] C1=Cc2cccc3cccc1c23
    [1.8620405] [1.10168349] CC1CO1
    [-0.5858227] [-0.88987406] CCN2c1ccccc1N(C)C(=S)c3cccnc23
    [-0.9799993] [-0.52649706] CC12CCC3C(CCc4cc(O)ccc34)C2CCC1=O
    [-1.0176951] [-0.76358725] Cn2cc(c1ccccc1)c(=O)c(c2)c3cccc(c3)C(F)(F)F
    [0.05622783] [-0.64020358] ClC(Cl)(Cl)C(NC=O)N1C=CN(C=C1)C(NC=O)C(Cl)(Cl)Cl

%% Cell type:markdown id: tags:

# Congratulations! Time to join the Community!

Congratulations on completing this tutorial notebook! If you enjoyed working through the tutorial, and want to continue working with DeepChem, we encourage you to finish the rest of the tutorials in this series. You can also help the DeepChem community in the following ways:

## Star DeepChem on [GitHub](https://github.com/deepchem/deepchem)
This helps build awareness of the DeepChem project and the tools for open source drug discovery that we're trying to build.

## Join the DeepChem Gitter
The DeepChem [Gitter](https://gitter.im/deepchem/Lobby) hosts a number of scientists, developers, and enthusiasts interested in deep learning for the life sciences. Join the conversation!