Merge branch 'master' into pdbbind-final (287b457a) · Commits · 钟慕尧 / deepchem

.style.yapf

0 → 100644

+3 −0

Original line number	Diff line number	Diff line
		[style]
		based_on_style = google
		indent_width = 2

.travis.yml

+5 −10

Original line number	Diff line number	Diff line
		@@ -14,19 +14,14 @@ install:
		- conda config --set always_yes yes --set changeps1 no
		- conda update -q conda
		- conda config --add channels http://conda.binstar.org/omnia
		- conda install pandas
		- conda install -c omnia rdkit
		- conda install -c omnia boost=1.59.0
		- conda install -c omnia openbabel
		- conda install joblib
		- conda install h5py
		- pip install six
		- conda install -c conda-forge protobuf=3.1.0
		- conda install -c omnia mdtraj
		- pip install tensorflow==0.12.1
		- bash scripts/install_deepchem_conda.sh deepchem
		- source activate deepchem
		- pip install yapf==0.16.0
		- python setup.py install
		script:
		- nosetests -v deepchem --nologcapture
		- find ./deepchem \| grep .py$ \|xargs python -m doctest -v
		- bash devtools/travis-ci/test_format_code.sh
		after_success:
		- echo $TRAVIS_SECURE_ENV_VARS
		- source devtools/travis-ci/after_sucess.sh

README.md

+12 −14

Original line number	Diff line number	Diff line
		# DeepChem
		# DeepChem

		DeepChem aims to provide a high quality open-source toolchain that
		democratizes the use of deep learning in drug discovery, materials science, and quantum
		@@ -51,7 +51,7 @@ Installation from source is the only currently supported format. ```deepchem```

		3. `rdkit`
		```bash
		conda install -c omnia rdkit
		conda install -c rdkit rdkit
		```

		4. `joblib`
		@@ -105,14 +105,11 @@ Installation from source is the only currently supported format. ```deepchem```
		Try running tests for one submodule at a time if memory proves an issue.

		### Using a conda environment
		Alternatively, you can install deepchem in a new conda environment using the following bash commands:
		Alternatively, you can install deepchem in a new conda environment using the conda commands in `scripts/install_deepchem_conda.sh`:

		```bash
		conda create -n deepchem python=3.5 -y # Create new env
		source activate deepchem # Activate it
		conda install -c omnia openbabel=2.4.0 rdkit mdtraj -y # Installs from omnia channel
		conda install joblib scikit-learn -y # Installs from default channel
		pip install six tensorflow-gpu nose # Pip installs
		bash scripts/install_deepchem_conda.sh deepchem
		pip install tensorflow-gpu==0.12.1 # If you want GPU support
		git clone https://github.com/deepchem/deepchem.git # Clone deepchem source code from GitHub
		cd deepchem
		python setup.py install # Manual install
		@@ -221,8 +218,8 @@ Random splitting

		\|Dataset \|Model \|Train score/ROC-AUC\|Valid score/ROC-AUC\|
		\|-----------\|--------------------\|-------------------\|-------------------\|
		\|tox21 \|logistic regression \|0.903 \|0.735 \|
		\| \|Multitask network \|0.856 \|0.783 \|
		\|tox21 \|logistic regression \|0.902 \|0.715 \|
		\| \|Multitask network \|0.844 \|0.795 \|
		\| \|robust MT-NN \|0.855 \|0.773 \|
		\| \|graph convolution \|0.865 \|0.827 \|
		\|muv \|logistic regression \|0.957 \|0.719 \|
		@@ -383,11 +380,12 @@ We actively encourage community contributions to DeepChem. The first place to st
		Once you've got a sense of how the package works, we encourage the use of GitHub issues to discuss more complex changes, raise requests for new features or propose changes to the global architecture of DeepChem. Once consensus is reached on the issue, please submit a PR with proposed modifications. All contributed code to DeepChem will be reviewed by a member of the DeepChem team, so please make sure your code style and documentation style match our guidelines!

		### Code Style Guidelines
		DeepChem broadly follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html). In terms of practical changes, the biggest effect is that all code uses 2-space indents instead of 4-space indents. We encourage new contributors to make use of [pylint](https://www.pylint.org/) with the following command
		```
		pylint --disable=invalid-name --indent-string " " --extension-pkg-whitelist=numpy [file.py]
		DeepChem uses [yapf](https://github.com/google/yapf) to autoformat code. We created a git pre-commit hook to make this process easier.

		``` bash
		cp devtools/travis-ci/pre-commit .git/hooks
		pip install yapf==0.16.0
		```
		Aim for a score of at least 8/10 on contributed files.

		### Documentation Style Guidelines
		DeepChem uses [NumPy style documentation](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt). Please follow these conventions when documenting code, since we use [Sphinx+Napoleon](http://www.sphinx-doc.org/en/stable/ext/napoleon.html) to automatically generate docs on [deepchem.io](http://deepchem.io).

deepchem/.gitignore

0 → 100644

+3 −0

Original line number	Diff line number	Diff line
		core_grid.tar.gz
		dock/autodock_vina_1_1_2_linux_x86/
		random_full_DNN.tar.gz

deepchem/data/data_loader.py

+43 −22

Original line number	Diff line number	Diff line
		@@ -20,14 +20,15 @@ from deepchem.utils.save import load_sdf_files
		from deepchem.feat import UserDefinedFeaturizer
		from deepchem.data import DiskDataset


		def convert_df_to_numpy(df, tasks, verbose=False):
		"""Transforms a dataframe containing deepchem input into numpy arrays"""
		n_samples = df.shape[0]
		n_tasks = len(tasks)

		time1 = time.time()
		y = np.hstack([
		np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
		y = np.hstack(
		[np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
		time2 = time.time()

		w = np.ones((n_samples, n_tasks))
		@@ -49,6 +50,7 @@ def convert_df_to_numpy(df, tasks, verbose=False):

		return y.astype(float), w.astype(float)


		def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
		"""Featurize individual compounds in dataframe.

		@@ -64,11 +66,12 @@ def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
		if ind % log_every_N == 0:
		log("Featurizing sample %d" % ind, verbose)
		features.append(featurizer.featurize([mol]))
		valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
		dtype=bool)
		valid_inds = np.array(
		[1 if elt.size > 0 else 0 for elt in features], dtype=bool)
		features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
		return np.squeeze(np.array(features)), valid_inds


		def get_user_specified_features(df, featurizer, verbose=True):
		"""Extract and merge user specified features.

		@@ -86,12 +89,15 @@ def get_user_specified_features(df, featurizer, verbose=True):

		"""
		time1 = time.time()
		df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(pd.to_numeric)
		df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(
		pd.to_numeric)
		X_shard = df.as_matrix(columns=featurizer.feature_fields)
		time2 = time.time()
		log("TIMING: user specified processing took %0.3f s" % (time2-time1), verbose)
		log("TIMING: user specified processing took %0.3f s" % (time2 - time1),
		verbose)
		return X_shard


		def featurize_mol_df(df, featurizer, field, verbose=True, log_every_N=1000):
		"""Featurize individual compounds in dataframe.

		@@ -108,11 +114,12 @@ def featurize_mol_df(df, featurizer, field, verbose=True, log_every_N=1000):
		if ind % log_every_N == 0:
		log("Featurizing sample %d" % ind, verbose)
		features.append(featurizer.featurize([mol]))
		valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
		dtype=bool)
		valid_inds = np.array(
		[1 if elt.size > 0 else 0 for elt in features], dtype=bool)
		features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
		return np.squeeze(np.array(features)), valid_inds


		class DataLoader(object):
		"""
		Handles loading/featurizing of chemical samples (datapoints).
		@@ -121,9 +128,14 @@ class DataLoader(object):
		dataframe object to disk as output.
		"""

		def __init__(self, tasks, smiles_field=None,
		id_field=None, mol_field=None, featurizer=None,
		verbose=True, log_every_n=1000):
		def __init__(self,
		tasks,
		smiles_field=None,
		id_field=None,
		mol_field=None,
		featurizer=None,
		verbose=True,
		log_every_n=1000):
		"""Extracts data from input as Pandas data frame"""
		if not isinstance(tasks, list):
		raise ValueError("tasks must be a list.")
		@@ -148,8 +160,10 @@ class DataLoader(object):

		if not isinstance(input_files, list):
		input_files = [input_files]

		def shard_generator():
		for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
		for shard_num, shard in enumerate(
		self.get_shards(input_files, shard_size)):
		time1 = time.time()
		X, valid_inds = self.featurize_shard(shard)
		ids = shard[self.id_field].values
		@@ -167,10 +181,12 @@ class DataLoader(object):
		assert len(X) == len(ids)

		time2 = time.time()
		log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
		self.verbose)
		log("TIMING: featurizing shard %d took %0.3f s" %
		(shard_num, time2 - time1), self.verbose)
		yield X, y, w, ids
		return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

		return DiskDataset.create_dataset(
		shard_generator(), data_dir, self.tasks, verbose=self.verbose)

		def get_shards(self, input_files, shard_size):
		"""Stub for children classes."""
		@@ -180,23 +196,26 @@ class DataLoader(object):
		"""Featurizes a shard of an input dataframe."""
		raise NotImplementedError


		class CSVLoader(DataLoader):
		"""
		Handles loading of CSV files.
		"""

		def get_shards(self, input_files, shard_size, verbose=True):
		"""Defines a generator which returns data for each shard"""
		return load_csv_files(input_files, shard_size, verbose=verbose)

		def featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		return featurize_smiles_df(shard, self.featurizer,
		field=self.smiles_field)
		return featurize_smiles_df(shard, self.featurizer, field=self.smiles_field)


		class UserCSVLoader(DataLoader):
		"""
		Handles loading of CSV files with user-defined featurizers.
		"""

		def get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		return load_csv_files(input_files, shard_size)
		@@ -207,16 +226,18 @@ class UserCSVLoader(DataLoader):
		X = get_user_specified_features(shard, self.featurizer)
		return (X, np.ones(len(X), dtype=bool))


		class SDFLoader(DataLoader):
		"""
		Handles loading of SDF files.
		"""

		def get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		return load_sdf_files(input_files)

		def featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		log("Currently featurizing feature_type: %s"
		% self.featurizer.__class__.__name__, self.verbose)
		log("Currently featurizing feature_type: %s" %
		self.featurizer.__class__.__name__, self.verbose)
		return featurize_mol_df(shard, self.featurizer, field=self.mol_field)

Admin message