Merge branch 'master' of https://github.com/deepchem/deepchem into low_data_updates (ce3478b9) · Commits · 钟慕尧 / deepchem

README.md

+11 −8

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ Stanford and originally created by [Bharath Ramsundar](http://rbharath.github.io
		* [Contributing to DeepChem](#contributing-to-deepchem)
		* [Code Style Guidelines](#code-style-guidelines)
		* [Documentation Style Guidelines](#documentation-style-guidelines)
		* [Gitter](#gitter)
		* [DeepChem Publications](#deepchem-publications)
		* [Examples](/examples)
		* [About Us](#about-us)
		@@ -29,7 +30,7 @@ Stanford and originally created by [Bharath Ramsundar](http://rbharath.github.io
		* [joblib](https://pypi.python.org/pypi/joblib)
		* [sklearn](https://github.com/scikit-learn/scikit-learn.git)
		* [numpy](https://store.continuum.io/cshop/anaconda/)
		* [keras](http://keras.io)
		* [six](https://pypi.python.org/pypi/six)
		* [mdtraj](http://mdtraj.org/)
		* [tensorflow](https://www.tensorflow.org/)

		@@ -56,11 +57,10 @@ Installation from source is the only currently supported format. ```deepchem```
		conda install joblib
		```

		5. `keras`
		5. `six`
		```bash
		pip install keras
		pip install six
		```
		`deepchem` only supports the `tensorflow` (default) backend for keras.

		6. `mdtraj`
		```bash
		@@ -72,7 +72,7 @@ Installation from source is the only currently supported format. ```deepchem```
		contact your local sysadmin to work out a custom installation. If your
		version of Linux is recent, then the following command will work:
		```
		conda install -c https://conda.anaconda.org/jjhelmus tensorflow
		pip install tensorflow-gpu
		```

		8. `deepchem`: Clone the `deepchem` github repo:
		@@ -332,6 +332,9 @@ Aim for a score of at least 8/10 on contributed files.
		### Documentation Style Guidelines
		DeepChem uses [NumPy style documentation](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt). Please follow these conventions when documenting code, since we use [Sphinx+Napoleon](http://www.sphinx-doc.org/en/stable/ext/napoleon.html) to automatically generate docs on [deepchem.io](http://deepchem.io).

		### Gitter
		Join us on gitter at [https://gitter.im/deepchem/Lobby](https://gitter.im/deepchem/Lobby). This is probably the easiest place to ask simple questions or float requests for new features.

		## DeepChem Publications
		1. [Computational Modeling of β-secretase 1 (BACE-1) Inhibitors using
		Ligand Based

datasets/chembl_5thresh.csv.gz

0 → 100644

+988 KiB

File added.

No diff preview for this file type.

View file

datasets/chembl_sparse.csv.gz

0 → 100644

+8.11 MiB

File added.

No diff preview for this file type.

View file

examples/chembl/init.py

0 → 100644

+0 −0

Empty file added.

examples/chembl/chembl_datasets.py

0 → 100644

+98 −0

Original line number	Diff line number	Diff line
		"""
		ChEMBL dataset loader.
		"""
		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		import os
		import time
		import numpy as np
		import deepchem as dc
		import sys

		sys.path.append(".")
		from chembl_tasks import chembl_tasks

		# Set shard size low to avoid memory problems.
		def load_chembl(shard_size=2000, featurizer="ECFP", set="5thresh", split="random"):
		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		# Set some global variables up top
		current_dir = os.path.dirname(os.path.realpath(__file__))

		# Load dataset
		print("About to load ChEMBL dataset.")
		if split == "year":
		train_datasets, valid_datasets, test_datasets = [], [], []
		train_files = os.path.join(current_dir,
		"year_sets/chembl_%s_ts_train.csv.gz" % set)
		valid_files = os.path.join(current_dir,
		"year_sets/chembl_%s_ts_valid.csv.gz" % set)
		test_files = os.path.join(current_dir,
		"year_sets/chembl_%s_ts_test.csv.gz" % set)
		else:
		dataset_path = os.path.join(
		current_dir, "../../datasets/chembl_%s.csv.gz" % set)

		# Featurize ChEMBL dataset
		print("About to featurize ChEMBL dataset.")
		if featurizer == 'ECFP':
		featurizer = dc.feat.CircularFingerprint(size=1024)
		elif featurizer == 'GraphConv':
		featurizer = dc.feat.ConvMolFeaturizer()

		loader = dc.data.CSVLoader(
		tasks=chembl_tasks, smiles_field="smiles", featurizer=featurizer)

		if split == "year":
		print("Featurizing train datasets")
		train_dataset = loader.featurize(
		train_files, shard_size=shard_size)

		print("Featurizing valid datasets")
		valid_dataset = loader.featurize(
		valid_files, shard_size=shard_size)

		print("Featurizing test datasets")
		test_dataset = loader.featurize(
		test_files, shard_size=shard_size)
		else:
		dataset = loader.featurize(dataset_path, shard_size=shard_size)

		# Initialize transformers
		print("About to transform data")
		if split == "year":
		transformers = [
		dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
		for transformer in transformers:
		for dataset in [train_dataset, valid_dataset, test_dataset]:
		transformer.transform(dataset)
		else:
		transformers = [
		dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
		for transformer in transformers:
		dataset = transformer.transform(dataset)

		splitters = {'index': dc.splits.IndexSplitter(),
		'random': dc.splits.RandomSplitter(),
		'scaffold': dc.splits.ScaffoldSplitter()}
		if split in splitters:
		splitter = splitters[split]
		print("Performing new split.")
		train, valid, test = splitter.train_valid_test_split(dataset)
		elif split == "year":
		print("Featurizing train datasets")
		train = loader.featurize(
		train_files, shard_size=shard_size)

		print("Featurizing valid datasets")
		valid = loader.featurize(
		valid_files, shard_size=shard_size)

		print("Featurizing test datasets")
		test = loader.featurize(
		test_files, shard_size=shard_size)

		return chembl_tasks, (train, valid, test), transformers

Admin message