Merge branch 'master' into fix-base-classes-feat (88dd25b5) · Commits · 钟慕尧 / deepchem

.travis.yml

+1 −1

Original line number	Diff line number	Diff line
		@@ -30,7 +30,7 @@ install:
		- hash -r
		- conda config --set always_yes yes --set changeps1 no
		- conda update -q conda
		- bash scripts/install_deepchem_conda.sh deepchem
		- bash scripts/install_deepchem_conda.sh cpu
		- conda activate deepchem
		- python setup.py install
		script:

deepchem/data/datasets.py

+91 −38

Original line number	Diff line number	Diff line
		@@ -1352,7 +1352,7 @@ class DiskDataset(Dataset):
		w_next = np.zeros((0,) + w_shape[1:])
		ids_next = np.zeros((0,), dtype=object)
		for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
		logger.info("Resharding shard %d/%d" % (shard_num, n_shards))
		logger.info("Resharding shard %d/%d" % (shard_num + 1, n_shards))
		# Handle shapes
		X = np.reshape(X, (len(X),) + self.get_data_shape())
		# Note that this means that DiskDataset resharding currently doesn't
		@@ -1816,8 +1816,18 @@ class DiskDataset(Dataset):
		def sparse_shuffle(self) -> None:
		"""Shuffling that exploits data sparsity to shuffle large datasets.

		Only for 1-dimensional feature vectors (does not work for tensorial
		featurizations).
		If feature vectors are sparse, say circular fingerprints or any other
		representation that contains few nonzero values, it can be possible to
		exploit the sparsity of the vector to simplify shuffles. This method
		implements a sparse shuffle by compressing sparse feature vectors down
		into a compressed representation, then shuffles this compressed dataset in
		memory and writes the results to disk.

		Note
		----
		This method only works for 1-dimensional feature vectors (does not work
		for tensorial featurizations). Note that this shuffle is performed in
		place.
		"""
		time1 = time.time()
		shard_size = self.get_shard_size()
		@@ -1855,52 +1865,84 @@ class DiskDataset(Dataset):
		logger.info("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1))

		def complete_shuffle(self, data_dir: Optional[str] = None) -> "DiskDataset":
		"""
		Completely shuffle across all data, across all shards.
		"""Completely shuffle across all data, across all shards.

		Note: this loads all the data into ram, and can be prohibitively
		expensive for larger datasets.
		Note
		----
		The algorithm used for this complete shuffle is O(N^2) where N is the
		number of shards. It simply constructs each shard of the output dataset
		one at a time. Since the complete shuffle can take a long time, it's
		useful to watch the logging output. Each shuffled shard is constructed
		using select() which logs as it selects from each original shard. This
		will result in O(N^2) logging statements, one for each extraction of
		shuffled shard i's contributions from original shard j.

		Parameters
		----------
		shard_size: int
		size of the resulting dataset's size. If None, then the first
		shard's shard_size will be used.
		data_dir: Optional[str], (default None)
		Directory to write the shuffled dataset to. If none is specified a
		temporary directory will be used.

		Returns
		-------
		DiskDataset
		A DiskDataset with a single shard.

		A DiskDataset whose data is a randomly shuffled version of this dataset.
		"""
		# Create temp directory to store shuffled version
		shuffle_dir = tempfile.mkdtemp()
		n_shards = self.get_number_shards()
		N = len(self)
		perm = np.random.permutation(N)
		shard_size = self.get_shard_size()

		def generator():
		start = 0
		shard_num = 0
		while start < N:
		logger.info("Constructing shard %d" % shard_num)
		if start + shard_size < N:
		end = start + shard_size
		else:
		end = N
		shard_indices = perm[start:end]
		# Note that this is in sorted order which doesn't respect the random
		# permutation.
		shard_dataset = self.select(shard_indices)
		# One bit of trickiness here is that select() will return in sorted
		# order. For example, suppose we'd like these elements in our permuted
		# shard:
		#
		# [12, 234, 1, 4]
		#
		# Then select would return elements in order
		#
		# [1, 4, 12, 234]
		#
		# We need to recover the original ordering. We can do this by using
		# np.where to find the locations of the original indices in the sorted
		# indices.
		sorted_indices = np.array(sorted(shard_indices))
		reverted_indices = np.array(
		# We know there's only one match for np.where since this is a
		# permutation, so the [0][0] pulls out the exact match location.
		[
		np.where(sorted_indices == orig_index)[0][0]
		for orig_index in shard_indices
		])
		# Let's pull out shard elements
		shard_X, shard_y, shard_w, shard_ids = (shard_dataset.X,
		shard_dataset.y,
		shard_dataset.w,
		shard_dataset.ids)

		yield (shard_X[reverted_indices], shard_y[reverted_indices],
		shard_w[reverted_indices], shard_ids[reverted_indices])

		start = end
		shard_num += 1

		all_X = []
		all_y = []
		all_w = []
		all_ids = []
		for Xs, ys, ws, ids in self.itershards():
		all_X.append(Xs)
		if ys is not None:
		all_y.append(ys)
		if ws is not None:
		all_w.append(ws)
		all_ids.append(ids)

		Xs = np.concatenate(all_X)
		ys = np.concatenate(all_y)
		ws = np.concatenate(all_w)
		ids = np.concatenate(all_ids)

		perm = np.random.permutation(Xs.shape[0])
		Xs = Xs[perm]
		ys = ys[perm]
		ws = ws[perm]
		ids = ids[perm]

		return DiskDataset.from_numpy(Xs, ys, ws, ids, data_dir=data_dir)
		return DiskDataset.create_dataset(
		generator(), data_dir=data_dir, tasks=self.get_task_names())

		def shuffle_each_shard(self,
		shard_basenames: Optional[List[str]] = None) -> None:
		@@ -2057,16 +2099,27 @@ class DiskDataset(Dataset):
		self._cached_shards = None

		def select(self, indices: Sequence[int],
		select_dir: str = None) -> "DiskDataset":
		select_dir: Optional[str] = None) -> "DiskDataset":
		"""Creates a new dataset from a selection of indices from self.

		Note
		----
		The specified indices will be returned in sorted order. That is, if you
		request that indices `[3, 1, 2]` are returned, you will get a
		`DiskDataset` which contains elements in order `[1, 2, 3]`.

		Parameters
		----------
		indices: list
		List of indices to select.
		select_dir: string
		select_dir: Optional[str], (default None)
		Path to new directory that the selected indices will be copied
		to.

		Returns
		-------
		DiskDataset
		Contains selected indices.
		"""
		if select_dir is not None:
		if not os.path.exists(select_dir):

deepchem/data/tests/test_shuffle.py

+45 −8

Original line number	Diff line number	Diff line
		"""
		Testing singletask/multitask dataset shuffling
		"""
		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import os
		import shutil
		import tempfile
		@@ -13,6 +9,47 @@ import deepchem as dc
		import numpy as np


		def test_complete_shuffle_one_shard():
		"""Test that complete shuffle works with only one shard."""
		X = np.random.rand(10, 10)
		dataset = dc.data.DiskDataset.from_numpy(X)
		shuffled = dataset.complete_shuffle()
		assert len(shuffled) == len(dataset)
		assert not np.array_equal(shuffled.ids, dataset.ids)
		assert sorted(shuffled.ids) == sorted(dataset.ids)
		assert shuffled.X.shape == dataset.X.shape
		assert shuffled.y.shape == dataset.y.shape
		assert shuffled.w.shape == dataset.w.shape


		def test_complete_shuffle_multiple_shard():
		"""Test that complete shuffle works with multiple shards."""
		X = np.random.rand(100, 10)
		dataset = dc.data.DiskDataset.from_numpy(X)
		dataset.reshard(shard_size=10)
		shuffled = dataset.complete_shuffle()
		assert len(shuffled) == len(dataset)
		assert not np.array_equal(shuffled.ids, dataset.ids)
		assert sorted(shuffled.ids) == sorted(dataset.ids)
		assert shuffled.X.shape == dataset.X.shape
		assert shuffled.y.shape == dataset.y.shape
		assert shuffled.w.shape == dataset.w.shape


		def test_complete_shuffle_multiple_shard_uneven():
		"""Test that complete shuffle works with multiple shards and some shards not full size."""
		X = np.random.rand(57, 10)
		dataset = dc.data.DiskDataset.from_numpy(X)
		dataset.reshard(shard_size=10)
		shuffled = dataset.complete_shuffle()
		assert len(shuffled) == len(dataset)
		assert not np.array_equal(shuffled.ids, dataset.ids)
		assert sorted(shuffled.ids) == sorted(dataset.ids)
		assert shuffled.X.shape == dataset.X.shape
		assert shuffled.y.shape == dataset.y.shape
		assert shuffled.w.shape == dataset.w.shape


		def test_complete_shuffle():
		"""Test that complete shuffle."""
		current_dir = os.path.dirname(os.path.realpath(__file__))
		@@ -22,8 +59,8 @@ def test_complete_shuffle():
		featurizer = dc.feat.CircularFingerprint(size=1024)
		tasks = ["log-solubility"]
		loader = dc.data.CSVLoader(
		tasks=tasks, smiles_field="smiles", featurizer=featurizer)
		dataset = loader.featurize(dataset_file, shard_size=2)
		tasks=tasks, feature_field="smiles", featurizer=featurizer)
		dataset = loader.create_dataset(dataset_file, shard_size=2)

		X_orig, y_orig, w_orig, orig_ids = (dataset.X, dataset.y, dataset.w,
		dataset.ids)
		@@ -52,8 +89,8 @@ def test_sparse_shuffle():
		featurizer = dc.feat.CircularFingerprint(size=1024)
		tasks = ["log-solubility"]
		loader = dc.data.CSVLoader(
		tasks=tasks, smiles_field="smiles", featurizer=featurizer)
		dataset = loader.featurize(dataset_file, shard_size=2)
		tasks=tasks, feature_field="smiles", featurizer=featurizer)
		dataset = loader.create_dataset(dataset_file, shard_size=2)

		X_orig, y_orig, w_orig, orig_ids = (dataset.X, dataset.y, dataset.w,
		dataset.ids)

deepchem/feat/graph_data.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -119,7 +119,7 @@ class GraphData:
		Returns
		-------
		dgl.DGLGraph
		Graph data for PyTorch Geometric
		Graph data for DGL

		Notes
		-----

deepchem/hyper/grid_search.py

+5 −1

Original line number	Diff line number	Diff line
		@@ -10,7 +10,7 @@ import collections
		import logging
		from functools import reduce
		from operator import mul
		from typing import Dict, List, Optional
		from typing import cast, Dict, List, Optional

		from deepchem.data import Dataset
		from deepchem.trans import Transformer
		@@ -155,6 +155,8 @@ class GridHyperparamOpt(HyperparamOpt):

		evaluator = Evaluator(model, valid_dataset, output_transformers)
		multitask_scores = evaluator.compute_model_performance([metric])
		# NOTE: this cast is a workaround; it has no effect at runtime.
		multitask_scores = cast(Dict[str, float], multitask_scores)
		valid_score = multitask_scores[metric.name]
		hp_str = _convert_hyperparam_dict_to_filename(hyper_params)
		all_scores[hp_str] = valid_score
		@@ -180,6 +182,8 @@ class GridHyperparamOpt(HyperparamOpt):
		return best_model, best_hyperparams, all_scores
		train_evaluator = Evaluator(best_model, train_dataset, output_transformers)
		multitask_scores = train_evaluator.compute_model_performance([metric])
		# NOTE: this cast is a workaround; it has no effect at runtime.
		multitask_scores = cast(Dict[str, float], multitask_scores)
		train_score = multitask_scores[metric.name]
		logger.info("Best hyperparameters: %s" % str(best_hyperparams))
		logger.info("train_score: %f" % train_score)

Admin message