Commit 2c0a5566 authored by miaecle's avatar miaecle
Browse files

Merge remote-tracking branch 'remotes/origin/master'

parents 6835a33a 0d573ffc
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -25,4 +25,3 @@ script:
after_success:
- echo $TRAVIS_SECURE_ENV_VARS
- coveralls
- source devtools/travis-ci/after_sucess.sh
+2 −3
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@ FROM nvidia/cuda

# Install some utilities
RUN apt-get update && \
    apt-get install -y -q wget git libxrender1 && \
    apt-get install -y -q wget git libxrender1 libsm6 && \
    apt-get clean

# Install miniconda
@@ -21,10 +21,9 @@ ENV PATH /miniconda/bin:$PATH
# TODO: Get rid of this when there is a stable release of deepchem.
RUN git clone https://github.com/deepchem/deepchem.git && \
    cd deepchem && \
    git checkout tags/1.2.0 && \
    git checkout tags/1.3.0 && \
    sed -i -- 's/tensorflow$/tensorflow-gpu/g' scripts/install_deepchem_conda.sh && \
    bash scripts/install_deepchem_conda.sh root && \
    pip install tensorflow-gpu==1.0.1 && \
    python setup.py develop

# Clean up
+1 −1
Original line number Diff line number Diff line
@@ -68,7 +68,7 @@ via this installation procedure.

### Easy Install via Conda
```bash
conda install -c deepchem -c rdkit -c conda-forge -c omnia deepchem=1.2.0
conda install -c deepchem -c rdkit -c conda-forge -c omnia deepchem=1.3.0
```

### Installing Dependencies Manually
+15 −11
Original line number Diff line number Diff line
@@ -324,10 +324,12 @@ class NumpyDataset(Dataset):
        sample_perm = np.arange(n_samples)
      if batch_size is None:
        batch_size = n_samples
      interval_points = np.linspace(
          0, n_samples, np.ceil(float(n_samples) / batch_size) + 1, dtype=int)
      for j in range(len(interval_points) - 1):
        indices = range(interval_points[j], interval_points[j + 1])
      batch_idx = 0
      num_batches = np.math.ceil(n_samples / batch_size)
      while batch_idx < num_batches:
        start = batch_idx * batch_size
        end = min(n_samples, (batch_idx + 1) * batch_size)
        indices = range(start, end)
        perm_indices = sample_perm[indices]
        X_batch = dataset._X[perm_indices]
        y_batch = dataset._y[perm_indices]
@@ -336,6 +338,7 @@ class NumpyDataset(Dataset):
        if pad_batches:
          (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
              batch_size, X_batch, y_batch, w_batch, ids_batch)
        batch_idx += 1
        yield (X_batch, y_batch, w_batch, ids_batch)

    return iterate(self, batch_size, deterministic, pad_batches)
@@ -663,13 +666,13 @@ class DiskDataset(Dataset):
          shard_batch_size = n_samples
        else:
          shard_batch_size = batch_size
        interval_points = np.linspace(
            0,
            n_samples,
            np.ceil(float(n_samples) / shard_batch_size) + 1,
            dtype=int)
        for j in range(len(interval_points) - 1):
          indices = range(interval_points[j], interval_points[j + 1])

        batch_idx = 0
        num_batches = np.math.ceil(n_samples / shard_batch_size)
        while batch_idx < num_batches:
          start = batch_idx * shard_batch_size
          end = min(n_samples, (batch_idx + 1) * shard_batch_size)
          indices = range(start, end)
          perm_indices = sample_perm[indices]
          X_batch = X[perm_indices]

@@ -687,6 +690,7 @@ class DiskDataset(Dataset):
          if pad_batches:
            (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
                shard_batch_size, X_batch, y_batch, w_batch, ids_batch)
          batch_idx += 1
          yield (X_batch, y_batch, w_batch, ids_batch)
      pool.close()

+23 −1
Original line number Diff line number Diff line
@@ -384,3 +384,25 @@ class TestDatasets(unittest.TestCase):
    np.testing.assert_allclose(comp_y_means, y_means)
    np.testing.assert_allclose(comp_X_stds, X_stds)
    np.testing.assert_allclose(comp_y_stds, y_stds)

  def test_disk_iterate_batch_size(self):
    """With pad_batches=False, DiskDataset.iterbatches must yield a short
    final batch rather than padding it up to batch_size."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    # Only X is needed here: each yielded batch's size is len(X).
    batch_sizes = [
        len(X) for X, _, _, _ in solubility_dataset.iterbatches(
            3, pad_batches=False, deterministic=True)
    ]
    # Batches of 3 over the dataset: three full batches plus a trailing 1.
    self.assertEqual([3, 3, 3, 1], batch_sizes)

  def test_numpy_iterate_batch_size(self):
    """With pad_batches=False, NumpyDataset.iterbatches must yield a short
    final batch rather than padding it up to batch_size."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    # Convert the on-disk dataset to an in-memory NumpyDataset first.
    solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
        solubility_dataset)
    # Only X is needed here: each yielded batch's size is len(X).
    batch_sizes = [
        len(X) for X, _, _, _ in solubility_dataset.iterbatches(
            3, pad_batches=False, deterministic=True)
    ]
    # Batches of 3 over the dataset: three full batches plus a trailing 1.
    self.assertEqual([3, 3, 3, 1], batch_sizes)
Loading