Commit ce3478b9 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Merge branch 'master' of https://github.com/deepchem/deepchem into low_data_updates

parents e324d525 778e9e5e
Loading
Loading
Loading
Loading
+11 −8
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ Stanford and originally created by [Bharath Ramsundar](http://rbharath.github.io
* [Contributing to DeepChem](#contributing-to-deepchem)
    * [Code Style Guidelines](#code-style-guidelines)
    * [Documentation Style Guidelines](#documentation-style-guidelines)
    * [Gitter](#gitter)
* [DeepChem Publications](#deepchem-publications)
* [Examples](/examples)
* [About Us](#about-us)
@@ -29,7 +30,7 @@ Stanford and originally created by [Bharath Ramsundar](http://rbharath.github.io
* [joblib](https://pypi.python.org/pypi/joblib)
* [sklearn](https://github.com/scikit-learn/scikit-learn.git)
* [numpy](https://store.continuum.io/cshop/anaconda/)
* [keras](http://keras.io)
* [six](https://pypi.python.org/pypi/six)
* [mdtraj](http://mdtraj.org/)
* [tensorflow](https://www.tensorflow.org/)

@@ -56,11 +57,10 @@ Installation from source is the only currently supported format. ```deepchem```
   conda install joblib 
   ```

5. `keras`
5. `six`
   ```bash
   pip install keras
   pip install six
   ```
   `deepchem` only supports the `tensorflow` (default) backend for keras.
      
6. `mdtraj`
   ```bash
@@ -72,7 +72,7 @@ Installation from source is the only currently supported format. ```deepchem```
    contact your local sysadmin to work out a custom installation. If your
    version of Linux is recent, then the following command will work:
    ```
    conda install -c https://conda.anaconda.org/jjhelmus tensorflow
    pip install tensorflow-gpu
    ```

8. `deepchem`: Clone the `deepchem` github repo:
@@ -332,6 +332,9 @@ Aim for a score of at least 8/10 on contributed files.
### Documentation Style Guidelines
DeepChem uses [NumPy style documentation](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt). Please follow these conventions when documenting code, since we use [Sphinx+Napoleon](http://www.sphinx-doc.org/en/stable/ext/napoleon.html) to automatically generate docs on [deepchem.io](http://deepchem.io).

### Gitter
Join us on Gitter at [https://gitter.im/deepchem/Lobby](https://gitter.im/deepchem/Lobby). This is probably the easiest place to ask simple questions or to float requests for new features.

## DeepChem Publications
1. [Computational Modeling of β-secretase 1 (BACE-1) Inhibitors using
Ligand Based
+988 KiB

File added.

No diff preview for this file type.

+8.11 MiB

File added.

No diff preview for this file type.

+0 −0

Empty file added.

+98 −0
Original line number Diff line number Diff line
"""
ChEMBL dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import time
import numpy as np
import deepchem as dc
import sys

sys.path.append(".")
from chembl_tasks import chembl_tasks

# Set shard size low to avoid memory problems.
def load_chembl(shard_size=2000, featurizer="ECFP", set="5thresh", split="random"):
    """Load, featurize, normalize and split the ChEMBL dataset.

    Parameters
    ----------
    shard_size: int, optional (default 2000)
        Forwarded to ``loader.featurize``. Kept low to avoid memory problems.
    featurizer: str or object, optional (default "ECFP")
        ``"ECFP"`` selects ``dc.feat.CircularFingerprint(size=1024)`` and
        ``"GraphConv"`` selects ``dc.feat.ConvMolFeaturizer()``. Any other
        value is handed to ``dc.data.CSVLoader`` unchanged.
    set: str, optional (default "5thresh")
        Name fragment substituted into the CSV file names
        (``chembl_<set>.csv.gz`` / ``chembl_<set>_ts_{train,valid,test}.csv.gz``).
        The name shadows the ``set`` builtin but is kept so existing keyword
        callers continue to work.
    split: str, optional (default "random")
        One of ``"index"``, ``"random"``, ``"scaffold"`` (split the single
        dataset with the matching ``dc.splits`` splitter) or ``"year"`` (use
        the pre-split files under ``year_sets/``).

    Returns
    -------
    tuple
        ``(chembl_tasks, (train, valid, test), transformers)``.

    Raises
    ------
    ValueError
        If ``split`` is not one of the supported values.
    """
    # Fail fast: without this check an unknown split would pay for the whole
    # featurization and then die with UnboundLocalError at the return.
    splitter_names = ("index", "random", "scaffold")
    if split != "year" and split not in splitter_names:
        raise ValueError(
            "Unknown split %r; expected one of 'index', 'random', "
            "'scaffold' or 'year'." % (split,))

    current_dir = os.path.dirname(os.path.realpath(__file__))

    # Locate the input file(s)
    print("About to load ChEMBL dataset.")
    if split == "year":
        train_files = os.path.join(current_dir,
                                   "year_sets/chembl_%s_ts_train.csv.gz" % set)
        valid_files = os.path.join(current_dir,
                                   "year_sets/chembl_%s_ts_valid.csv.gz" % set)
        test_files = os.path.join(current_dir,
                                  "year_sets/chembl_%s_ts_test.csv.gz" % set)
    else:
        dataset_path = os.path.join(
            current_dir, "../../datasets/chembl_%s.csv.gz" % set)

    # Featurize ChEMBL dataset
    print("About to featurize ChEMBL dataset.")
    if featurizer == 'ECFP':
        featurizer = dc.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = dc.feat.ConvMolFeaturizer()

    loader = dc.data.CSVLoader(
        tasks=chembl_tasks, smiles_field="smiles", featurizer=featurizer)

    if split == "year":
        print("Featurizing train datasets")
        train = loader.featurize(train_files, shard_size=shard_size)

        print("Featurizing valid datasets")
        valid = loader.featurize(valid_files, shard_size=shard_size)

        print("Featurizing test datasets")
        test = loader.featurize(test_files, shard_size=shard_size)

        # Normalization statistics come from the training set only.
        print("About to transform data")
        transformers = [
            dc.trans.NormalizationTransformer(transform_y=True, dataset=train)]
        for transformer in transformers:
            # Keep the transformed datasets: they are what gets returned, so
            # the data stays consistent with the returned transformers.
            train = transformer.transform(train)
            valid = transformer.transform(valid)
            test = transformer.transform(test)
    else:
        dataset = loader.featurize(dataset_path, shard_size=shard_size)

        print("About to transform data")
        transformers = [
            dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
        for transformer in transformers:
            dataset = transformer.transform(dataset)

        splitters = {'index': dc.splits.IndexSplitter(),
                     'random': dc.splits.RandomSplitter(),
                     'scaffold': dc.splits.ScaffoldSplitter()}
        print("Performing new split.")
        train, valid, test = splitters[split].train_valid_test_split(dataset)

    return chembl_tasks, (train, valid, test), transformers
Loading