Merge branch 'master' into update-dependecies (fcc4278e) · Commits · 钟慕尧 / deepchem

README.md

+2 −1

Original line number	Diff line number	Diff line
		@@ -5,7 +5,7 @@
		[![Anaconda-Server Badge](https://anaconda.org/conda-forge/deepchem/badges/version.svg)](https://anaconda.org/conda-forge/deepchem)
		[![PyPI version](https://badge.fury.io/py/deepchem.svg)](https://badge.fury.io/py/deepchem)

		[Website](https://deepchem.io/) \| [Documentation (master)](https://deepchem.readthedocs.io/en/latest/)) \| [Colab Tutorial](https://github.com/deepchem/deepchem/tree/master/examples/tutorials) \| [Discussion Forum](https://forum.deepchem.io/) \| [Gitter](https://gitter.im/deepchem/Lobby)
		[Website](https://deepchem.io/) \| [Documentation (master)](https://deepchem.readthedocs.io/en/latest/) \| [Colab Tutorial](https://github.com/deepchem/deepchem/tree/master/examples/tutorials) \| [Discussion Forum](https://forum.deepchem.io/) \| [Gitter](https://gitter.im/deepchem/Lobby)

		DeepChem aims to provide a high quality open-source toolchain
		that democratizes the use of deep-learning in drug discovery,
		@@ -58,6 +58,7 @@ DeepChem has a number of "soft" requirements. These are packages which are neede
		- [RDKit](http://www.rdkit.org/docs/Install.html)
		- [simdna](https://github.com/kundajelab/simdna)
		- [XGBoost](https://xgboost.readthedocs.io/en/latest/)
		- [Weights & Biases](https://docs.wandb.com/)
		- [Tensorflow Probability](https://www.tensorflow.org/probability)

		## Installation

deepchem/data/datasets.py

+41 −7

Original line number	Diff line number	Diff line
		@@ -412,8 +412,8 @@ class Dataset(object):

		Returns
		-------
		If `X_stats == True`, returns `(X_means, X_stds)`. If `y_stats ==
		True`, returns `(y_means, y_stds)`. If both are true, returns
		If `X_stats == True`, returns `(X_means, X_stds)`. If `y_stats == True`,
		returns `(y_means, y_stds)`. If both are true, returns
		`(X_means, X_stds, y_means, y_stds)`.
		"""
		X_means = 0.0
		@@ -1160,8 +1160,8 @@ class DiskDataset(Dataset):
		`math.ceil(len(dataset)/batch_size)`. Each minibatch is returned as
		a tuple of four numpy arrays: `(X, y, w, ids)`.

		Parameters:
		-----------
		Parameters
		----------
		batch_size: int
		Number of elements in a batch. If None, then it yields batches
		with size equal to the size of each individual shard.
		@@ -1655,6 +1655,36 @@ class DiskDataset(Dataset):
		return np.array(
		load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)

		def get_shard_y(self, i: int) -> np.ndarray:
		    """Retrieves the labels for the i-th shard.

		    If shard `i` is held in `self._cached_shards`, its `y` attribute is
		    returned directly. Otherwise the file named in the `y` column of row
		    `i` of `self.metadata_df` is loaded from `self.data_dir`.

		    Parameters
		    ----------
		    i: int
		      Shard index for shard to retrieve labels from

		    Returns
		    -------
		    np.ndarray
		      Labels of the i-th shard.
		    """
		    cache = self._cached_shards
		    if cache is not None and cache[i] is not None:
		        return cache[i].y
		    row = self.metadata_df.iloc[i]
		    # Deliberately no `dtype=object`: forcing an object array here would
		    # make the dtype of the result depend on whether the shard happens to
		    # be cached, and would hand numeric labels back as generic objects.
		    return np.array(load_from_disk(os.path.join(self.data_dir, row['y'])))

		def get_shard_w(self, i: int) -> np.ndarray:
		    """Retrieves the weights for the i-th shard.

		    If shard `i` is held in `self._cached_shards`, its `w` attribute is
		    returned directly. Otherwise the file named in the `w` column of row
		    `i` of `self.metadata_df` is loaded from `self.data_dir`.

		    Parameters
		    ----------
		    i: int
		      Shard index for shard to retrieve weights from

		    Returns
		    -------
		    np.ndarray
		      Weights of the i-th shard.
		    """
		    cache = self._cached_shards
		    if cache is not None and cache[i] is not None:
		        return cache[i].w
		    row = self.metadata_df.iloc[i]
		    # Deliberately no `dtype=object`: forcing an object array here would
		    # make the dtype of the result depend on whether the shard happens to
		    # be cached, and would hand numeric weights back as generic objects.
		    return np.array(load_from_disk(os.path.join(self.data_dir, row['w'])))

		def add_shard(self, X, y, w, ids):
		"""Adds a data shard."""
		metadata_rows = self.metadata_df.values.tolist()
		@@ -1758,9 +1788,12 @@ class DiskDataset(Dataset):
		@property
		def y(self):
		"""Get the y vector for this dataset as a single numpy array."""
		if len(self) == 0:
		return np.array([])
		ys = []
		one_dimensional = False
		for (_, y_b, _, _) in self.itershards():
		for i in range(self.get_number_shards()):
		y_b = self.get_shard_y(i)
		ys.append(y_b)
		if len(y_b.shape) == 1:
		one_dimensional = True
		@@ -1774,8 +1807,9 @@ class DiskDataset(Dataset):
		"""Get the weight vector for this dataset as a single numpy array."""
		ws = []
		one_dimensional = False
		for (_, _, w_b, _) in self.itershards():
		ws.append(np.array(w_b))
		for i in range(self.get_number_shards()):
		w_b = self.get_shard_w(i)
		ws.append(w_b)
		if len(w_b.shape) == 1:
		one_dimensional = True
		if not one_dimensional:

deepchem/data/tests/test_property.py

0 → 100644

+30 −0

Original line number	Diff line number	Diff line
		import numpy as np
		import deepchem as dc


		def test_y_property():
		    """Check that `DiskDataset.y` returns the labels the dataset was built from."""
		    n_samples, n_features, n_tasks = 10, 10, 1
		    features = np.random.rand(n_samples, n_features)
		    labels = np.random.randint(2, size=(n_samples, n_tasks))
		    weights = np.ones((n_samples, n_tasks))
		    identifiers = np.array(["id"] * n_samples)
		    disk_dataset = dc.data.DiskDataset.from_numpy(features, labels, weights,
		                                                  identifiers)
		    # Labels read back through the property must equal the input labels.
		    np.testing.assert_array_equal(labels, disk_dataset.y)


		def test_w_property():
		    """Test that dataset.w works."""
		    num_datapoints = 10
		    num_features = 10
		    num_tasks = 1
		    X = np.random.rand(num_datapoints, num_features)
		    y = np.random.randint(2, size=(num_datapoints, num_tasks))
		    # Non-uniform weights, so the comparison below cannot be satisfied by a
		    # constant placeholder array of ones.
		    w = np.random.rand(num_datapoints, num_tasks)
		    ids = np.array(["id"] * num_datapoints)
		    dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
		    w_out = dataset.w
		    np.testing.assert_array_equal(w, w_out)

deepchem/dock/binding_pocket.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -18,8 +18,8 @@ logger = logging.getLogger(__name__)
		def extract_active_site(protein_file, ligand_file, cutoff=4):
		"""Extracts a box for the active site.

		Params
		------
		Parameters
		----------
		protein_file: str
		Location of protein PDB
		ligand_file: str
		@@ -116,8 +116,8 @@ class ConvexHullPocketFinder(BindingPocketFinder):
		face of the hull is converted into a coordinate box used for
		binding.

		Params
		------
		Parameters
		----------
		macromolecule_file: str
		Location of the macromolecule file to load

deepchem/dock/pose_generation.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -93,8 +93,8 @@ class VinaPoseGenerator(PoseGenerator):
		def __init__(self, sixty_four_bits=True, pocket_finder=None):
		"""Initializes Vina Pose Generator

		Params
		------
		Parameters
		----------
		sixty_four_bits: bool, optional (default True)
		Specifies whether this is a 64-bit machine. Needed to download
		the correct executable.

Admin message