Merge pull request #2082 from nd-02110114/fix-windows-ci (37e61ee9) · Commits · 钟慕尧 / deepchem

deepchem/data/datasets.py

+57 −106

Original line number	Diff line number	Diff line
		@@ -9,18 +9,16 @@ import numpy as np
		import pandas as pd
		import random
		import logging
		from pandas import read_hdf
		import tempfile
		import time
		import shutil
		import json
		import warnings
		import multiprocessing
		from deepchem.utils.save import save_to_disk
		from deepchem.utils.save import load_from_disk
		from ast import literal_eval as make_tuple

		from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union
		from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Tuple, Union
		from deepchem.utils.typing import OneOrMany, Shape

		Batch = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
		@@ -516,6 +514,7 @@ class Dataset(object):

		Returns
		-------
		torch.utils.data.IterableDataset
		`torch.utils.data.IterableDataset` that iterates over the data in
		this dataset.
		"""
		@@ -870,34 +869,21 @@ class NumpyDataset(Dataset):
		deterministic: bool
		if True, the data is produced in order. If False, a different random
		permutation of the data is used for each epoch.
		"""
		import torch

		def iterate():
		n_samples = self._X.shape[0]
		worker_info = torch.utils.data.get_worker_info()
		if worker_info is None:
		first_sample = 0
		last_sample = n_samples
		else:
		first_sample = worker_info.id * n_samples // worker_info.num_workers
		last_sample = (
		worker_info.id + 1) * n_samples // worker_info.num_workers
		for epoch in range(epochs):
		if deterministic:
		order = first_sample + np.arange(last_sample - first_sample)
		else:
		order = first_sample + np.random.permutation(last_sample -
		first_sample)
		for i in order:
		yield (self._X[i], self._y[i], self._w[i], self._ids[i])

		class TorchDataset(torch.utils.data.IterableDataset): # type: ignore

		def __iter__(self):
		return iterate()
		Returns
		-------
		torch.utils.data.IterableDataset
		`torch.utils.data.IterableDataset` that iterates over the data in
		this dataset.
		"""
		try:
		from deepchem.data.pytorch_datasets import _TorchNumpyDataset
		except:
		raise ValueError("This method requires PyTorch to be installed.")

		return TorchDataset()
		pytorch_ds = _TorchNumpyDataset(
		numpy_dataset=self, epochs=epochs, deterministic=deterministic)
		return pytorch_ds

		@staticmethod
		def from_DiskDataset(ds: "DiskDataset") -> "NumpyDataset":
		@@ -958,6 +944,15 @@ class NumpyDataset(Dataset):
		return NumpyDataset(X, y, w, ids, n_tasks=y.shape[1])


		class _Shard(object):

		def __init__(self, X, y, w, ids):
		self.X = X
		self.y = y
		self.w = w
		self.ids = ids


		class DiskDataset(Dataset):
		"""
		A Dataset that is stored as a set of files on disk.
		@@ -1135,7 +1130,7 @@ class DiskDataset(Dataset):
		metadata_df = pd.read_csv(metadata_filename, compression='gzip')
		metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
		return tasks, metadata_df
		except Exception as e:
		except Exception:
		pass

		# Load obsolete format -> save in new format
		@@ -1482,8 +1477,8 @@ class DiskDataset(Dataset):
		# than process based pools, since process based pools need to pickle/serialize
		# objects as an extra overhead. Also, as hideously as un-thread safe this looks,
		# we're actually protected by the GIL.
		pool = multiprocessing.dummy.Pool(
		1) # mp.dummy aliases ThreadPool to Pool
		# mp.dummy aliases ThreadPool to Pool
		pool = multiprocessing.dummy.Pool(1)

		if batch_size is None:
		num_global_batches = num_shards
		@@ -1703,33 +1698,21 @@ class DiskDataset(Dataset):
		deterministic: bool
		if True, the data is produced in order. If False, a different random
		permutation of the data is used for each epoch.
		"""
		import torch

		def iterate():
		worker_info = torch.utils.data.get_worker_info()
		n_shards = self.get_number_shards()
		if worker_info is None:
		first_shard = 0
		last_shard = n_shards
		else:
		first_shard = worker_info.id * n_shards // worker_info.num_workers
		last_shard = (worker_info.id + 1) * n_shards // worker_info.num_workers
		if first_shard == last_shard:
		return
		shard_indices = list(range(first_shard, last_shard))
		for epoch in range(epochs):
		for X, y, w, ids in self._iterbatches_from_shards(
		shard_indices, deterministic=deterministic):
		for i in range(X.shape[0]):
		yield (X[i], y[i], w[i], ids[i])

		class TorchDataset(torch.utils.data.IterableDataset): # type: ignore

		def __iter__(self):
		return iterate()
		Returns
		-------
		torch.utils.data.IterableDataset
		`torch.utils.data.IterableDataset` that iterates over the data in
		this dataset.
		"""
		try:
		from deepchem.data.pytorch_datasets import _TorchDiskDataset
		except:
		raise ValueError("This method requires PyTorch to be installed.")

		return TorchDataset()
		pytorch_ds = _TorchDiskDataset(
		disk_dataset=self, epochs=epochs, deterministic=deterministic)
		return pytorch_ds

		@staticmethod
		def from_numpy(X: np.ndarray,
		@@ -1962,14 +1945,6 @@ class DiskDataset(Dataset):
		def get_shard(self, i: int) -> Batch:
		"""Retrieves data for the i-th shard from disk."""

		class Shard(object):

		def __init__(self, X, y, w, ids):
		self.X = X
		self.y = y
		self.w = w
		self.ids = ids

		# See if we have a cached copy of this shard.
		if self._cached_shards is None:
		self._cached_shards = [None] * self.get_number_shards()
		@@ -2010,7 +1985,7 @@ class DiskDataset(Dataset):
		# shard again before the next time we want this one. So just cache as many
		# as we can and then stop.

		shard = Shard(X, y, w, ids)
		shard = _Shard(X, y, w, ids)
		shard_size = X.nbytes + ids.nbytes
		if y is not None:
		shard_size += y.nbytes
		@@ -2526,42 +2501,18 @@ class ImageDataset(Dataset):

		Returns
		-------
		`torch.utils.data.IterableDataset` iterating over the same data as
		torch.utils.data.IterableDataset
		`torch.utils.data.IterableDataset` that iterates over the data in
		this dataset.
		"""
		import torch

		def get_image(array, index):
		if isinstance(array, np.ndarray):
		return array[index]
		return dc.data.ImageLoader.load_img([array[index]])[0]

		def iterate():
		n_samples = self._X_shape[0]
		worker_info = torch.utils.data.get_worker_info()
		if worker_info is None:
		first_sample = 0
		last_sample = n_samples
		else:
		first_sample = worker_info.id * n_samples // worker_info.num_workers
		last_sample = (
		worker_info.id + 1) * n_samples // worker_info.num_workers
		for epoch in range(epochs):
		if deterministic:
		order = first_sample + np.arange(last_sample - first_sample)
		else:
		order = first_sample + np.random.permutation(last_sample -
		first_sample)
		for i in order:
		yield (get_image(self._X, i), get_image(self._y, i), self._w[i],
		self._ids[i])

		class TorchDataset(torch.utils.data.IterableDataset): # type: ignore

		def __iter__(self):
		return iterate()
		try:
		from deepchem.data.pytorch_datasets import _TorchImageDataset
		except:
		raise ValueError("This method requires PyTorch to be installed.")

		return TorchDataset()
		pytorch_ds = _TorchImageDataset(
		image_dataset=self, epochs=epochs, deterministic=deterministic)
		return pytorch_ds


		class Databag(object):

deepchem/data/pytorch_datasets.py

0 → 100644

+142 −0

Original line number	Diff line number	Diff line
		from typing import List, Union
		import numpy as np
		import torch

		from deepchem.data.data_loader import ImageLoader
		from deepchem.data.datasets import NumpyDataset, DiskDataset, ImageDataset


		class _TorchNumpyDataset(torch.utils.data.IterableDataset):  # type: ignore
		  """Iterable PyTorch view over the samples of a NumpyDataset."""

		  def __init__(self, numpy_dataset: NumpyDataset, epochs: int,
		               deterministic: bool):
		    """Remember the wrapped dataset and the iteration settings.

		    Parameters
		    ----------
		    numpy_dataset: NumpyDataset
		      The NumpyDataset whose samples are exposed through this wrapper
		    epochs: int
		      how many full passes over the data a single iteration makes
		    deterministic: bool
		      if True, samples come out in index order. If False, each epoch
		      draws a fresh random permutation of the samples.
		    """
		    self.numpy_dataset = numpy_dataset
		    self.epochs = epochs
		    self.deterministic = deterministic

		  def __iter__(self):
		    """Yield ``(X, y, w, id)`` tuples, one sample at a time."""
		    total = self.numpy_dataset._X.shape[0]
		    info = torch.utils.data.get_worker_info()
		    if info is not None:
		      # Running inside a DataLoader worker: take this worker's contiguous
		      # slice of the sample index range.
		      start = info.id * total // info.num_workers
		      stop = (info.id + 1) * total // info.num_workers
		    else:
		      start, stop = 0, total
		    for _ in range(self.epochs):
		      if self.deterministic:
		        offsets = np.arange(stop - start)
		      else:
		        offsets = np.random.permutation(stop - start)
		      for idx in start + offsets:
		        source = self.numpy_dataset
		        yield (source._X[idx], source._y[idx], source._w[idx],
		               source._ids[idx])


		class _TorchDiskDataset(torch.utils.data.IterableDataset):  # type: ignore
		  """Iterable PyTorch view over the samples of a DiskDataset."""

		  def __init__(self, disk_dataset: DiskDataset, epochs: int,
		               deterministic: bool):
		    """Remember the wrapped dataset and the iteration settings.

		    Parameters
		    ----------
		    disk_dataset: DiskDataset
		      The DiskDataset whose samples are exposed through this wrapper
		    epochs: int
		      how many full passes over the data a single iteration makes
		    deterministic: bool
		      forwarded unchanged to the dataset's shard iterator; if True the
		      data is produced in order, otherwise it is shuffled for each epoch.
		    """
		    self.disk_dataset = disk_dataset
		    self.epochs = epochs
		    self.deterministic = deterministic

		  def __iter__(self):
		    """Yield ``(X, y, w, id)`` tuples, one sample at a time."""
		    info = torch.utils.data.get_worker_info()
		    shard_count = self.disk_dataset.get_number_shards()
		    if info is not None:
		      # Running inside a DataLoader worker: take this worker's contiguous
		      # range of shards.
		      lo = info.id * shard_count // info.num_workers
		      hi = (info.id + 1) * shard_count // info.num_workers
		    else:
		      lo, hi = 0, shard_count
		    if lo == hi:
		      # No shards assigned here, so there is nothing to produce.
		      return

		    assigned = list(range(lo, hi))
		    for _ in range(self.epochs):
		      batches = self.disk_dataset._iterbatches_from_shards(
		          assigned, deterministic=self.deterministic)
		      for X, y, w, ids in batches:
		        # Unroll each batch into individual samples.
		        for row in range(X.shape[0]):
		          yield (X[row], y[row], w[row], ids[row])


		class _TorchImageDataset(torch.utils.data.IterableDataset):  # type: ignore
		  """Iterable PyTorch view over the samples of an ImageDataset."""

		  def __init__(self, image_dataset: ImageDataset, epochs: int,
		               deterministic: bool):
		    """Remember the wrapped dataset and the iteration settings.

		    Parameters
		    ----------
		    image_dataset: ImageDataset
		      The ImageDataset whose samples are exposed through this wrapper
		    epochs: int
		      how many full passes over the data a single iteration makes
		    deterministic: bool
		      if True, samples come out in index order. If False, each epoch
		      draws a fresh random permutation of the samples.
		    """
		    self.image_dataset = image_dataset
		    self.epochs = epochs
		    self.deterministic = deterministic

		  def __iter__(self):
		    """Yield ``(X, y, w, id)`` tuples, one sample at a time."""
		    total = self.image_dataset._X_shape[0]
		    info = torch.utils.data.get_worker_info()
		    if info is not None:
		      # Running inside a DataLoader worker: take this worker's contiguous
		      # slice of the sample index range.
		      start = info.id * total // info.num_workers
		      stop = (info.id + 1) * total // info.num_workers
		    else:
		      start, stop = 0, total
		    for _ in range(self.epochs):
		      if self.deterministic:
		        offsets = np.arange(stop - start)
		      else:
		        offsets = np.random.permutation(stop - start)
		      for idx in start + offsets:
		        source = self.image_dataset
		        yield (self._get_image(source._X, idx),
		               self._get_image(source._y, idx), source._w[idx],
		               source._ids[idx])

		  def _get_image(self, array: Union[np.ndarray, List[str]],
		                 index: int) -> np.ndarray:
		    """Fetch a single image, reading it from file when necessary.

		    Parameters
		    ----------
		    array: Union[np.ndarray, List[str]]
		      Either a numpy array of images or a list of image filenames
		    index: int
		      Position of the wanted image within ``array``

		    Returns
		    -------
		    np.ndarray
		      The element at ``index`` when ``array`` is a numpy array, otherwise
		      the image that ``ImageLoader.load_img`` returns for that entry.
		    """
		    if not isinstance(array, np.ndarray):
		      # Not an in-memory array: hand the single entry to the image loader.
		      loaded = ImageLoader.load_img([array[index]])
		      return loaded[0]
		    return array[index]

deepchem/data/tests/test_datasets.py

+2 −10

Original line number	Diff line number	Diff line
		@@ -4,14 +4,9 @@ Tests for dataset creation
		import random
		import math
		import unittest
		import tempfile
		import os
		import shutil
		import numpy as np
		import deepchem as dc
		import tensorflow as tf
		import pandas as pd
		from tensorflow.python.framework import test_util

		try:
		import torch
		@@ -25,7 +20,6 @@ def load_solubility_data():
		current_dir = os.path.dirname(os.path.abspath(__file__))
		featurizer = dc.feat.CircularFingerprint(size=1024)
		tasks = ["log-solubility"]
		task_type = "regression"
		input_file = os.path.join(current_dir, "../../models/tests/example.csv")
		loader = dc.data.CSVLoader(
		tasks=tasks, smiles_field="smiles", featurizer=featurizer)
		@@ -107,7 +101,6 @@ def test_pad_features():
		"""Test that pad_features pads features correctly."""
		batch_size = 100
		num_features = 10
		num_tasks = 5

		# Test cases where n_samples < 2*n_samples < batch_size
		n_samples = 29
		@@ -302,7 +295,6 @@ def test_select():

		def test_complete_shuffle():
		shard_sizes = [1, 2, 3, 4, 5]
		batch_size = 10

		all_Xs, all_ys, all_ws, all_ids = [], [], [], []

		@@ -526,7 +518,7 @@ def test_disk_iterate_y_w_None():
		shard_sizes = [21, 11, 41, 21, 51]
		batch_size = 10

		all_Xs, all_ys, all_ws, all_ids = [], [], [], []
		all_Xs, all_ids = [], []

		def shard_generator():
		for sz in shard_sizes:
		@@ -815,7 +807,7 @@ def test_to_str():
		assert str(dataset) == ref_str


		class TestDatasets(test_util.TensorFlowTestCase):
		class TestDatasets(unittest.TestCase):
		"""
		Test basic top-level API for dataset objects.
		"""

deepchem/models/tests/test_generalize.py

+2 −5

Original line number	Diff line number	Diff line
		@@ -5,10 +5,7 @@ Tests to make sure deepchem models can fit models on easy datasets.
		import sklearn
		import sklearn.datasets
		import numpy as np
		import unittest
		import tempfile
		import deepchem as dc
		from sklearn.ensemble import RandomForestRegressor
		from sklearn.linear_model import LinearRegression
		from sklearn.linear_model import LogisticRegression

		@@ -180,7 +177,7 @@ def test_sklearn_multitask_classification():
		model.save()
		# Eval model on test
		scores = model.evaluate(test_dataset, [classification_metric])
		assert scores['roc_auc_score'] > 0.5
		assert scores[classification_metric.name] > .5


		def test_xgboost_regression():
		@@ -247,7 +244,7 @@ def test_xgboost_multitask_regression():
		# Eval model on test
		scores = model.evaluate(test_dataset, [regression_metric])
		score = scores[regression_metric.name]
		assert score < 50
		assert score < 55


		def test_xgboost_classification():

Admin message