Unverified Commit bebb5562 authored by Nathan Frey's avatar Nathan Frey Committed by GitHub
Browse files

Merge branch 'master' into material_featurizer_renames

parents fd42a76e 53366e7d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,3 +18,4 @@ from deepchem.data.data_loader import JsonLoader
from deepchem.data.data_loader import SDFLoader
from deepchem.data.data_loader import FASTALoader
from deepchem.data.data_loader import ImageLoader
from deepchem.data.data_loader import InMemoryLoader
+311 −47
Original line number Diff line number Diff line
@@ -12,13 +12,14 @@ import time
import sys
import logging
import warnings
from typing import List, Optional, Dict, Tuple
from typing import List, Optional, Dict, Tuple, Any, Sequence, Union

from deepchem.utils.typing import OneOrMany
from deepchem.utils.save import load_csv_files, load_json_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.genomics import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import DiskDataset, NumpyDataset, ImageDataset
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
import zipfile

logger = logging.getLogger(__name__)
@@ -85,6 +86,10 @@ def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
    The name of a column in `df` that holds SMILES strings
  log_every_n: int, optional (default 1000)
    Emit a logging statement every `log_every_n` rows.

  Note
  ----
  This function requires RDKit to be installed
  """
  sample_elems = df[field].tolist()

@@ -238,7 +243,10 @@ class DataLoader(object):
    self.featurizer = featurizer
    self.log_every_n = log_every_n

  def featurize(self, input_files, data_dir=None, shard_size=8192):
  def featurize(self,
                inputs: Sequence[Any],
                data_dir: Optional[str] = None,
                shard_size: Optional[int] = 8192) -> Dataset:
    """Featurize provided files and write to specified location.

    DEPRECATED: This method is now a wrapper for `create_dataset()`
@@ -253,8 +261,8 @@ class DataLoader(object):

    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
@@ -263,18 +271,21 @@ class DataLoader(object):
    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    from `inputs`.
    """
    warnings.warn(
        "featurize() is deprecated and has been renamed to create_dataset(). featurize() will be removed in DeepChem 3.0",
        FutureWarning)
    return self.create_dataset(input_files, data_dir, shard_size)
    return self.create_dataset(inputs, data_dir, shard_size)

  def create_dataset(self, input_files, data_dir=None, shard_size=8192):
  def create_dataset(self,
                     inputs: Sequence[Any],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `input_files` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    Reads in `inputs` and uses `self.featurizer` to featurize the
    data in these inputs.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

@@ -285,8 +296,8 @@ class DataLoader(object):

    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
@@ -295,17 +306,16 @@ class DataLoader(object):
    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %d" % shard_size)
    logger.info("shard_size: %s" % str(shard_size))

    if not isinstance(input_files, list):
      input_files = [input_files]
    if not isinstance(inputs, list):
      inputs = [inputs]

    def shard_generator():
      for shard_num, shard in enumerate(
          self._get_shards(input_files, shard_size)):
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        time1 = time.time()
        X, valid_inds = self._featurize_shard(shard)
        ids = shard[self.id_field].values
@@ -329,11 +339,11 @@ class DataLoader(object):

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, input_files, shard_size):
  def _get_shards(self, inputs, shard_size):
    """Stub for children classes.

    Should implement a generator that walks over the source data in
    `input_files` and returns a "shard" at a time. Here a shard is a
    `inputs` and returns a "shard" at a time. Here a shard is a
    chunk of input data that can reasonably be handled in memory. For
    example, this may be a set of rows from a CSV file or a set of
    molecules from a SDF file. To re-use the
@@ -345,8 +355,8 @@ class DataLoader(object):
    
    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    shard_size: int, optional
      Number of examples stored in each shard.
    """
@@ -411,7 +421,15 @@ class CSVLoader(DataLoader):
    self.log_every_n = log_every_n

  def _get_shards(self, input_files, shard_size):
    """Defines a generator which returns data for each shard"""
    """Defines a generator which returns data for each shard

    Parameters
    ----------
    input_files: list[str]
      List of filenames to process
    shard_size: int
      The size of a shard of data to process at a time.
    """
    return load_csv_files(input_files, shard_size)

  def _featurize_shard(self, shard):
@@ -467,7 +485,7 @@ class JsonLoader(DataLoader):
  """

  def __init__(self,
               tasks: List[str],
               tasks: OneOrMany[str],
               feature_field: str,
               label_field: str = None,
               weight_field: str = None,
@@ -510,14 +528,14 @@ class JsonLoader(DataLoader):
    self.log_every_n = log_every_n

  def create_dataset(self,
                     input_files: List[str],
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates a `Dataset` from input JSON files.

    Parameters
    ----------
    input_files: List[str]
    input_files: OneOrMany[str]
      List of JSON filenames.
    data_dir: Optional[str], default None
      Name of directory where featurized data is stored.
@@ -531,9 +549,16 @@ class JsonLoader(DataLoader):
      from `input_files`.

    """

    if not isinstance(input_files, list):
      try:
        if isinstance(input_files, str):
          input_files = [input_files]
        else:
          input_files = list(input_files)
      except TypeError:
        raise ValueError(
            "input_files is of an unrecognized form. Must be one filename or a list of filenames."
        )

    def shard_generator():
      """Yield X, y, w, and ids for shards."""
@@ -685,7 +710,10 @@ class FASTALoader(DataLoader):
    """Initialize loader."""
    pass

  def create_dataset(self, input_files, data_dir=None, shard_size=None):
  def create_dataset(self,
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and only allows for one-hot
@@ -706,7 +734,7 @@ class FASTALoader(DataLoader):
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
    if not isinstance(input_files, list):
    if isinstance(input_files, str):
      input_files = [input_files]

    def shard_generator():
@@ -728,7 +756,7 @@ class ImageLoader(DataLoader):
  traverse subdirectories which contain images.
  """

  def __init__(self, tasks=None):
  def __init__(self, tasks: OneOrMany[str] = None):
    """Initialize image loader.

    At present, custom image featurizers aren't supported by this
@@ -744,22 +772,30 @@ class ImageLoader(DataLoader):
    self.tasks = tasks

  def create_dataset(self,
                     input_files,
                     labels=None,
                     weights=None,
                     in_memory=False):
                     inputs: Union[OneOrMany[str], Tuple[Any]],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192,
                     in_memory: bool = False) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided image files and labels/weights.

    Parameters
    ----------
    input_files: list
      Each file in this list should either be of a supported
      image format (.png, .tif only for now) or of a compressed
      folder of image files (only .zip for now).
    labels: optional
      If provided, a numpy ndarray of image labels
    weights: optional
      If provided, a numpy ndarray of image weights
    inputs: `Union[OneOrMany[str], Tuple[Any]]`
      The inputs provided should be one of the following

      - filename
      - list of filenames
      - Tuple (list of filenames, labels)
      - Tuple (list of filenames, labels, weights)

      Each file in a given list of filenames should either be of a supported
      image format (.png, .tif only for now) or of a compressed folder of
      image files (only .zip for now). If `labels` or `weights` are provided,
      they must correspond to the sorted order of all filenames provided, with
      one label/weight per file.

    data_dir: str, optional
      Directory to store featurized dataset.
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return ImageDataset.

@@ -767,8 +803,23 @@ class ImageLoader(DataLoader):
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`, `labels`, and `weights`.

    """
    if not isinstance(input_files, list):
    labels, weights = None, None
    if isinstance(inputs, tuple):
      if len(inputs) == 1:
        input_files = inputs[0]
        if isinstance(inputs, str):
          input_files = [inputs]
      elif len(inputs) == 2:
        input_files, labels = inputs
      elif len(inputs) == 3:
        input_files, labels, weights = inputs
      else:
        raise ValueError("Input must be a tuple of length 1, 2, or 3")
    else:
      input_files = inputs
    if isinstance(input_files, str):
      input_files = [input_files]

    image_files = []
@@ -804,14 +855,44 @@ class ImageLoader(DataLoader):
          raise ValueError("Unsupported file format")
      input_files = remainder

    # Sort image files
    image_files = sorted(image_files)

    if in_memory:
      if data_dir is None:
        return NumpyDataset(
            self.load_img(image_files), y=labels, w=weights, ids=image_files)
      else:
        dataset = DiskDataset.from_numpy(
            self.load_img(image_files),
            y=labels,
            w=weights,
            ids=image_files,
            tasks=self.tasks,
            data_dir=data_dir)
        if shard_size is not None:
          dataset.reshard(shard_size)
        return dataset
    else:
      return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

  @staticmethod
  def load_img(image_files):
  def load_img(image_files) -> np.ndarray:
    """Loads a set of images from disk.

    Parameters
    ----------
    image_files: list[str]
      List of image filenames to load

    Returns
    -------
    np.ndarray that contains loaded images. Of shape `(N,...)`.

    Note
    ----
    This method requires PIL to be installed.
    """
    from PIL import Image
    images = []
    for image_file in image_files:
@@ -827,3 +908,186 @@ class ImageLoader(DataLoader):
      else:
        raise ValueError("Unsupported image filetype for %s" % image_file)
    return np.array(images)


class InMemoryLoader(DataLoader):
  """Facilitate featurization of in-memory objects.

  When featurizing a dataset, it's often the case that the initial set of
  data (pre-featurization) fits handily within memory. (For example, perhaps
  it fits within a column of a pandas DataFrame.) In this case, it would be
  convenient to directly be able to featurize this column of data. However,
  the process of featurization often generates large arrays which quickly eat
  up available memory. This class provides convenient capabilities to process
  such in-memory data by checkpointing generated features periodically to
  disk.

  Example
  -------
  Here's an example with only datapoints and no labels or weights.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(smiles, shard_size=2)
  >>> len(dataset)
  4

  Here's an example with both datapoints and labels.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> labels = [1, 0, 1, 0]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(zip(smiles, labels), shard_size=2)
  >>> len(dataset)
  4

  Here's an example with datapoints, labels, weights and ids all provided.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> labels = [1, 0, 1, 0]
  >>> weights = [1.5, 0, 1.5, 0]
  >>> ids = ["C", "CC", "CCC", "CCCC"]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(zip(smiles, labels, weights, ids), shard_size=2)
  >>> len(dataset)
  4

  """

  def create_dataset(self,
                     inputs: Sequence[Any],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `inputs` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards`
    and `_featurize_shard` are implemented and that each shard
    returned by `_get_shards` is a pandas dataframe.  You may choose
    to reuse or override this method in your subclass implementations.

    Parameters
    ----------
    inputs: Sequence[Any]
      List of inputs to process. Entries can be arbitrary objects so long as
      they are understood by `self.featurizer`
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
      Number of examples stored in each shard. If None, all inputs are
      placed in a single shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %s" % str(shard_size))

    # Materialize generators/iterables (e.g. zip objects from the examples
    # above) into a list so entries can be counted and sharded.
    if not isinstance(inputs, list):
      try:
        inputs = list(inputs)
      except TypeError:
        inputs = [inputs]

    def shard_generator():
      global_index = 0
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        time1 = time.time()
        X, y, w, ids = self._featurize_shard(shard, global_index)
        global_index += len(shard)

        time2 = time.time()
        logger.info("TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1))
        yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, inputs, shard_size):
    """Break up input into shards.

    Parameters
    ----------
    inputs: list[object]
      Each entry in this list must be of the form `(featurization_input,
      label, weight, id)` or `(featurization_input, label, weight)` or
      `(featurization_input, label)` or `featurization_input` for one
      datapoint, where `featurization_input` is any input that is recognized
      by `self.featurizer`.
    shard_size: int, optional
      The size of shard to generate. If None, no sharding is performed and
      all of `inputs` is yielded as one shard.

    Returns
    -------
    Iterator which iterates over shards of data.
    """
    current_shard: List[Any] = []
    for i, datapoint in enumerate(inputs):
      # Emit a completed shard before starting the next one. A `shard_size`
      # of None means everything goes into a single shard.
      if shard_size is not None and i != 0 and i % shard_size == 0:
        shard_data = current_shard
        current_shard = []
        yield shard_data
      current_shard.append(datapoint)
    # Guard against yielding an empty trailing shard when `inputs` is empty;
    # for non-empty `inputs` the final shard always holds at least one entry.
    if current_shard:
      yield current_shard

  def _featurize_shard(self, shard, global_index):
    """Featurizes a shard of an input data.

    Parameters
    ----------
    shard: list
      List each entry of which must be of the form `(featurization_input,
      label, weight, id)` or `(featurization_input, label, weight)` or
      `(featurization_input, label)` or `featurization_input` for one
      datapoint, where `featurization_input` is any input that is recognized
      by `self.featurizer`.
    global_index: int
      The starting index for this shard in the full set of provided inputs

    Returns
    -------
    Tuple `(X, y, w, ids)` of numpy arrays for this shard. Missing labels
    and weights default to zeros/ones respectively; missing ids default to
    the datapoint's global index.

    Raises
    ------
    ValueError
      If any tuple entry has a length outside 1-4.
    """
    features = []
    labels = []
    weights = []
    ids = []
    n_tasks = len(self.tasks)
    for i, entry in enumerate(shard):
      if not isinstance(entry, tuple):
        entry = (entry,)
      # Also reject zero-length tuples, which would otherwise fall through
      # every branch below and leave `featurization_input` unbound.
      if not 1 <= len(entry) <= 4:
        raise ValueError(
            "Entry is malformed and must be of length 1-4 containing featurization_input and optionally label, weight, and id."
        )
      if len(entry) == 4:
        featurization_input, label, weight, entry_id = entry
      elif len(entry) == 3:
        featurization_input, label, weight = entry
        entry_id = global_index + i
      elif len(entry) == 2:
        featurization_input, label = entry
        weight = np.ones((n_tasks), np.float32)
        entry_id = global_index + i
      elif len(entry) == 1:
        # Unwrap the 1-tuple built above so the featurizer receives the raw
        # datapoint, consistent with the 2/3/4-tuple branches.
        featurization_input = entry[0]
        label = np.zeros((n_tasks), np.float32)
        weight = np.zeros((n_tasks), np.float32)
        entry_id = global_index + i
      feature = self.featurizer(featurization_input)
      features.append(feature)
      weights.append(weight)
      labels.append(label)
      ids.append(entry_id)
    X = np.concatenate(features, axis=0)
    y = np.array(labels)
    w = np.array(weights)
    ids = np.array(ids)
    return X, y, w, ids
+20 −8
Original line number Diff line number Diff line
@@ -888,7 +888,7 @@ class NumpyDataset(Dataset):
        for i in order:
          yield (self._X[i], self._y[i], self._w[i], self._ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):
    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
@@ -1090,15 +1090,14 @@ class DiskDataset(Dataset):
    Gets learning tasks associated with this dataset.
    """
    return self.tasks
    # if not len(self.metadata_df):
    #  raise ValueError("No data in dataset.")
    # return next(self.metadata_df.iterrows())[1]['task_names']

  def reshard(self, shard_size: int) -> None:
    """Reshards data to have specified shard size."""
    # Create temp directory to store resharded version
    reshard_dir = tempfile.mkdtemp()

    n_shards = self.get_number_shards()

    # Write data in new shards
    def generator():
      tasks = self.get_task_names()
@@ -1106,7 +1105,8 @@ class DiskDataset(Dataset):
      y_next = np.zeros((0,) + (len(tasks),))
      w_next = np.zeros((0,) + (len(tasks),))
      ids_next = np.zeros((0,), dtype=object)
      for (X, y, w, ids) in self.itershards():
      for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
        logger.info("Resharding shard %d/%d" % (shard_num, n_shards))
        X_next = np.concatenate([X_next, X], axis=0)
        y_next = np.concatenate([y_next, y], axis=0)
        w_next = np.concatenate([w_next, w], axis=0)
@@ -1366,8 +1366,11 @@ class DiskDataset(Dataset):
      out_dir = tempfile.mkdtemp()
    tasks = self.get_task_names()

    n_shards = self.get_number_shards()

    def generator():
      for shard_num, row in self.metadata_df.iterrows():
        logger.info("Transforming shard %d/%d" % (shard_num, n_shards))
        X, y, w, ids = self.get_shard(shard_num)
        newx, newy, neww = fn(X, y, w)
        yield (newx, newy, neww, ids)
@@ -1409,7 +1412,7 @@ class DiskDataset(Dataset):
          for i in range(X.shape[0]):
            yield (X[i], y[i], w[i], ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):
    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
@@ -1485,6 +1488,7 @@ class DiskDataset(Dataset):

    def generator():
      for ind, dataset in enumerate(datasets):
        logger.info("Merging in dataset %d/%d" % (ind, len(datasets)))
        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        yield (X, y, w, ids)

@@ -1761,9 +1765,12 @@ class DiskDataset(Dataset):
    indices = np.array(sorted(indices)).astype(int)
    tasks = self.get_task_names()

    n_shards = self.get_number_shards()

    def generator():
      count, indices_count = 0, 0
      for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
        logger.info("Selecting from shard %d/%d" % (shard_num, n_shards))
        shard_len = len(X)
        # Find indices which rest in this shard
        num_shard_elts = 0
@@ -1936,7 +1943,12 @@ class ImageDataset(Dataset):
    self._X_shape = self._find_array_shape(X)
    self._y_shape = self._find_array_shape(y)
    if w is None:
      if len(self._y_shape) == 1:
      if len(self._y_shape) == 0:
        # Case n_samples should be 1
        if n_samples != 1:
          raise ValueError("y can only be a scalar if n_samples == 1")
        w = np.ones_like(y)
      elif len(self._y_shape) == 1:
        w = np.ones(self._y_shape[0], np.float32)
      else:
        w = np.ones((self._y_shape[0], 1), np.float32)
@@ -2164,7 +2176,7 @@ class ImageDataset(Dataset):
          yield (get_image(self._X, i), get_image(self._y, i), self._w[i],
                 self._ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):
    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
+8 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import tempfile
from scipy import misc
import deepchem as dc
import zipfile
import numpy as np


class TestImageLoader(unittest.TestCase):
@@ -62,6 +63,13 @@ class TestImageLoader(unittest.TestCase):
    # These are the known dimensions of face.png
    assert dataset.X.shape == (1, 768, 1024, 3)

  def test_png_simple_load_with_labels(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize((self.face_path, np.array(1)))
    # These are the known dimensions of face.png
    assert dataset.X.shape == (1, 768, 1024, 3)
    assert (dataset.y == np.ones((1,))).all()

  def test_tif_simple_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.tif_image_path)
+58 −0
Original line number Diff line number Diff line
import deepchem as dc
import numpy as np


def test_inmemory_features():
  """Featurize a bare list of SMILES; ids default to sequential indices."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(molecules, shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.ids == np.arange(4)).all()


def test_inmemory_features_and_labels():
  """Featurize (smiles, label) pairs supplied as an iterator of tuples."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(zip(molecules, targets), shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.y == np.array(targets)).all()
  assert (ds.ids == np.arange(4)).all()


def test_inmemory_features_and_labels_and_weights():
  """Featurize (smiles, label, weight) triples; ids still auto-generated."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  sample_weights = [1.5, 1.5, 1, 1]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(
      zip(molecules, targets, sample_weights), shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.y == np.array(targets)).all()
  assert (ds.w == np.array(sample_weights)).all()
  assert (ds.ids == np.arange(4)).all()


def test_inmemory_features_and_labels_and_weights_and_ids():
  """Featurize full (smiles, label, weight, id) 4-tuples; ids are preserved."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  sample_weights = [1.5, 1.5, 1, 1]
  identifiers = molecules
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(
      zip(molecules, targets, sample_weights, identifiers), shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.y == np.array(targets)).all()
  assert (ds.w == np.array(sample_weights)).all()
  assert (ds.ids == np.array(identifiers)).all()
Loading