Changes (cf989416) · Commits · 钟慕尧 / deepchem

deepchem/data/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -18,3 +18,4 @@ from deepchem.data.data_loader import JsonLoader
		from deepchem.data.data_loader import SDFLoader
		from deepchem.data.data_loader import FASTALoader
		from deepchem.data.data_loader import ImageLoader
		from deepchem.data.data_loader import InMemoryLoader

deepchem/data/data_loader.py

+224 −19

Original line number	Diff line number	Diff line
		@@ -85,6 +85,10 @@ def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
		The name of a column in `df` that holds SMILES strings
		log_every_n: int, optional (default 1000)
		Emit a logging statement every `log_every_n` rows.

		Note
		----
		This function requires RDKit to be installed
		"""
		sample_elems = df[field].tolist()

		@@ -238,7 +242,7 @@ class DataLoader(object):
		self.featurizer = featurizer
		self.log_every_n = log_every_n

		def featurize(self, input_files, data_dir=None, shard_size=8192):
		def featurize(self, inputs, data_dir=None, shard_size=8192):
		"""Featurize provided files and write to specified location.

		DEPRECATED: This method is now a wrapper for `create_dataset()`
		@@ -253,8 +257,8 @@ class DataLoader(object):

		Parameters
		----------
		input_files: list
		List of input filenames.
		inputs: list
		List of inputs to process. Entries can be filenames or arbitrary objects.
		data_dir: str, optional
		Directory to store featurized dataset.
		shard_size: int, optional
		@@ -263,17 +267,17 @@ class DataLoader(object):
		Returns
		-------
		A `Dataset` object containing a featurized representation of data
		from `input_files`.
		from `inputs`.
		"""
		warnings.warn(
		"featurize() is deprecated and has been renamed to create_dataset(). featurize() will be removed in DeepChem 3.0",
		FutureWarning)
		return self.create_dataset(input_files, data_dir, shard_size)
		return self.create_dataset(inputs, data_dir, shard_size)

		def create_dataset(self, input_files, data_dir=None, shard_size=8192):
		def create_dataset(self, inputs, data_dir=None, shard_size=8192):
		"""Creates and returns a `Dataset` object by featurizing provided files.

		Reads in `input_files` and uses `self.featurizer` to featurize the
		Reads in `inputs` and uses `self.featurizer` to featurize the
		data in these input files. For large files, automatically shards
		into smaller chunks of `shard_size` datapoints for convenience.
		Returns a `Dataset` object that contains the featurized dataset.
		@@ -285,8 +289,8 @@ class DataLoader(object):

		Parameters
		----------
		input_files: list
		List of input filenames.
		inputs: list
		List of inputs to process. Entries can be filenames or arbitrary objects.
		data_dir: str, optional
		Directory to store featurized dataset.
		shard_size: int, optional
		@@ -295,17 +299,16 @@ class DataLoader(object):
		Returns
		-------
		A `Dataset` object containing a featurized representation of data
		from `input_files`.
		from `inputs`.
		"""
		logger.info("Loading raw samples now.")
		logger.info("shard_size: %d" % shard_size)

		if not isinstance(input_files, list):
		input_files = [input_files]
		if not isinstance(inputs, list):
		inputs = [inputs]

		def shard_generator():
		for shard_num, shard in enumerate(
		self._get_shards(input_files, shard_size)):
		for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
		time1 = time.time()
		X, valid_inds = self._featurize_shard(shard)
		ids = shard[self.id_field].values
		@@ -329,11 +332,11 @@ class DataLoader(object):

		return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

		def _get_shards(self, input_files, shard_size):
		def _get_shards(self, inputs, shard_size):
		"""Stub for children classes.

		Should implement a generator that walks over the source data in
		`input_files` and returns a "shard" at a time. Here a shard is a
		`inputs` and returns a "shard" at a time. Here a shard is a
		chunk of input data that can reasonably be handled in memory. For
		example, this may be a set of rows from a CSV file or a set of
		molecules from a SDF file. To re-use the
		@@ -345,8 +348,8 @@ class DataLoader(object):

		Parameters
		----------
		input_files: list
		List of input filenames.
		inputs: list
		List of inputs to process. Entries can be filenames or arbitrary objects.
		shard_size: int, optional
		Number of examples stored in each shard.
		"""
		@@ -411,7 +414,15 @@ class CSVLoader(DataLoader):
		self.log_every_n = log_every_n

		def _get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		"""Defines a generator which returns data for each shard

		Parameters
		----------
		input_files: list[str]
		List of filenames to process
		shard_size: int
		The size of a shard of data to process at a time.
		"""
		return load_csv_files(input_files, shard_size)

		def _featurize_shard(self, shard):
		@@ -812,6 +823,21 @@ class ImageLoader(DataLoader):

		@staticmethod
		def load_img(image_files):
		"""Loads a set of images from disk.

		Parameters
		----------
		image_files: list[str]
		List of image filenames to load

		Returns
		-------
		np.ndarray that contains the loaded images, of shape `(N, ...)`.

		Note
		----
		This method requires PIL to be installed.
		"""
		from PIL import Image
		images = []
		for image_file in image_files:
		@@ -827,3 +853,182 @@ class ImageLoader(DataLoader):
		else:
		raise ValueError("Unsupported image filetype for %s" % image_file)
		return np.array(images)


		class InMemoryLoader(DataLoader):
		  """Facilitate featurization of in-memory objects.

		  When featurizing a dataset, it's often the case that the initial set of
		  data (pre-featurization) fits handily within memory. (For example, perhaps
		  it fits within a column of a pandas DataFrame.) In this case, it would be
		  convenient to directly be able to featurize this column of data. However,
		  the process of featurization often generates large arrays which quickly eat
		  up available memory. This class provides convenient capabilities to process
		  such in-memory data by checkpointing generated features periodically to
		  disk.

		  Example
		  -------
		  Here's an example with only datapoints and no labels or weights.

		  >>> import deepchem as dc
		  >>> smiles = ["C", "CC", "CCC", "CCCC"]
		  >>> featurizer = dc.feat.CircularFingerprint()
		  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
		  >>> dataset = loader.create_dataset(smiles, shard_size=2)
		  >>> len(dataset)
		  4

		  Here's an example with both datapoints and labels

		  >>> import deepchem as dc
		  >>> smiles = ["C", "CC", "CCC", "CCCC"]
		  >>> labels = [1, 0, 1, 0]
		  >>> featurizer = dc.feat.CircularFingerprint()
		  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
		  >>> dataset = loader.create_dataset(zip(smiles, labels), shard_size=2)
		  >>> len(dataset)
		  4

		  Here's an example with datapoints, labels, weights and ids all provided.

		  >>> import deepchem as dc
		  >>> smiles = ["C", "CC", "CCC", "CCCC"]
		  >>> labels = [1, 0, 1, 0]
		  >>> weights = [1.5, 0, 1.5, 0]
		  >>> ids = ["C", "CC", "CCC", "CCCC"]
		  >>> featurizer = dc.feat.CircularFingerprint()
		  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
		  >>> dataset = loader.create_dataset(zip(smiles, labels, weights, ids), shard_size=2)
		  >>> len(dataset)
		  4

		  """

		  def create_dataset(self, inputs, data_dir=None, shard_size=8192):
		    """Creates and returns a `Dataset` object by featurizing in-memory inputs.

		    Splits `inputs` into shards of at most `shard_size` datapoints,
		    featurizes each shard with `self.featurizer`, and hands the resulting
		    `(X, y, w, ids)` tuples to `DiskDataset.create_dataset`.

		    Unlike the file-based loaders, each shard produced by `_get_shards`
		    here is a plain Python list of entries, not a pandas dataframe.

		    Parameters
		    ----------
		    inputs: list
		      Datapoints to process. Each entry is either a bare featurization
		      input or a tuple `(featurization_input, label[, weight[, id]])`.
		      Any other iterable (e.g. a `zip` object) is materialized into a
		      list. A single string or a non-iterable object is treated as one
		      datapoint.
		    data_dir: str, optional
		      Directory to store featurized dataset.
		    shard_size: int, optional
		      Number of examples stored in each shard.

		    Returns
		    -------
		    A `Dataset` object containing a featurized representation of data
		    from `inputs`.
		    """
		    logger.info("Loading raw samples now.")
		    logger.info("shard_size: %d", shard_size)

		    if isinstance(inputs, (str, bytes)):
		      # A lone string is a single datapoint. `list("CCO")` would not raise
		      # TypeError; it would silently split the string into characters.
		      inputs = [inputs]
		    elif not isinstance(inputs, list):
		      try:
		        inputs = list(inputs)
		      except TypeError:
		        # Not iterable: treat the object itself as the only datapoint.
		        inputs = [inputs]

		    def shard_generator():
		      # Running offset of the current shard within `inputs`; used to
		      # generate default ids that are unique across shards.
		      global_index = 0
		      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
		        time1 = time.time()
		        X, y, w, ids = self._featurize_shard(shard, global_index)
		        global_index += len(shard)

		        time2 = time.time()
		        logger.info("TIMING: featurizing shard %d took %0.3f s", shard_num,
		                    time2 - time1)
		        yield X, y, w, ids

		    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

		  def _get_shards(self, inputs, shard_size):
		    """Break up input into shards.

		    Parameters
		    ----------
		    inputs: list[object]
		      Each entry in this list must be of the form `(featurization_input,
		      label, weight, id)` or `(featurization_input, label, weight)` or
		      `(featurization_input, label)` or `featurization_input` for one
		      datapoint, where `featurization_input` is any input that is recognized
		      by `self.featurizer`.
		    shard_size: int
		      The size of shard to generate.

		    Returns
		    -------
		    Iterator which iterates over shards of data. Every yielded shard is a
		    non-empty list; empty `inputs` yields no shards at all.
		    """
		    current_shard = []
		    for datapoint in inputs:
		      current_shard.append(datapoint)
		      if len(current_shard) == shard_size:
		        yield current_shard
		        current_shard = []
		    # Only emit the trailing partial shard when it holds data; an empty
		    # shard cannot be featurized (np.concatenate rejects an empty list).
		    if current_shard:
		      yield current_shard

		  def _featurize_shard(self, shard, global_index):
		    """Featurizes a shard of an input data.

		    Parameters
		    ----------
		    shard: list
		      List each entry of which must be of the form `(featurization_input,
		      label, weight, id)` or `(featurization_input, label, weight)` or
		      `(featurization_input, label)` or `featurization_input` for one
		      datapoint, where `featurization_input` is any input that is recognized
		      by `self.featurizer`.
		    global_index: int
		      The starting index for this shard in the full set of provided inputs

		    Returns
		    -------
		    Tuple `(X, y, w, ids)` of numpy arrays for this shard. Missing fields
		    are filled in per entry: a missing id becomes the entry's position in
		    the full input, a missing weight becomes ones of length `n_tasks`, and
		    a missing label becomes zeros with a zero weight.

		    Raises
		    ------
		    ValueError
		      If an entry is a tuple whose length is not between 1 and 4.
		    """
		    features = []
		    labels = []
		    weights = []
		    ids = []
		    n_tasks = len(self.tasks)
		    for i, entry in enumerate(shard):
		      if not isinstance(entry, tuple):
		        entry = (entry,)
		      n_fields = len(entry)
		      # Reject empty tuples as well as over-long ones; an empty tuple would
		      # otherwise leave the per-entry variables unbound or stale.
		      if not 1 <= n_fields <= 4:
		        raise ValueError(
		            "Entry is malformed and must be of length 1-4 containing featurization_input and optionally label, weight, and id."
		        )
		      featurization_input = entry[0]
		      if n_fields >= 2:
		        label = entry[1]
		        weight = entry[2] if n_fields >= 3 else np.ones(n_tasks, np.float32)
		      else:
		        # No label supplied: use a zero label and give it zero weight.
		        label = np.zeros(n_tasks, np.float32)
		        weight = np.zeros(n_tasks, np.float32)
		      entry_id = entry[3] if n_fields == 4 else global_index + i
		      # Always pass a one-element batch so every entry arity is featurized
		      # the same way (bare entries were already passed as a 1-tuple).
		      # NOTE(review): assumes the featurizer accepts an iterable of
		      # datapoints and returns one row per datapoint - confirm.
		      feature = self.featurizer([featurization_input])
		      features.append(feature)
		      weights.append(weight)
		      labels.append(label)
		      ids.append(entry_id)
		    X = np.concatenate(features, axis=0)
		    y = np.array(labels)
		    w = np.array(weights)
		    ids = np.array(ids)
		    return X, y, w, ids

deepchem/data/tests/test_inmemory.py

0 → 100644

+58 −0

Original line number	Diff line number	Diff line
		import deepchem as dc
		import numpy as np


		def test_inmemory_features():
		  """Bare SMILES inputs: checks size, feature shape, shard count and default ids."""
		  molecules = ["C", "CC", "CCC", "CCCC"]
		  fingerprint = dc.feat.CircularFingerprint(size=1024)
		  in_memory_loader = dc.data.InMemoryLoader(
		      tasks=["task1"], featurizer=fingerprint)
		  featurized = in_memory_loader.create_dataset(molecules, shard_size=2)

		  expected_ids = np.arange(4)
		  assert len(featurized) == 4
		  assert featurized.X.shape == (4, 1024)
		  # Four datapoints at two per shard.
		  assert featurized.get_number_shards() == 2
		  assert (featurized.ids == expected_ids).all()


		def test_inmemory_features_and_labels():
		  """(input, label) pairs: labels are checked and ids fall back to indices."""
		  molecules = ["C", "CC", "CCC", "CCCC"]
		  targets = [1, 0, 1, 0]
		  fingerprint = dc.feat.CircularFingerprint(size=1024)
		  in_memory_loader = dc.data.InMemoryLoader(
		      tasks=["task1"], featurizer=fingerprint)
		  pairs = zip(molecules, targets)
		  featurized = in_memory_loader.create_dataset(pairs, shard_size=2)

		  expected_y = np.array(targets)
		  expected_ids = np.arange(4)
		  assert len(featurized) == 4
		  assert featurized.X.shape == (4, 1024)
		  assert (featurized.y == expected_y).all()
		  assert featurized.get_number_shards() == 2
		  assert (featurized.ids == expected_ids).all()


		def test_inmemory_features_and_labels_and_weights():
		  """(input, label, weight) triples: labels and weights are checked, ids default."""
		  molecules = ["C", "CC", "CCC", "CCCC"]
		  targets = [1, 0, 1, 0]
		  sample_weights = [1.5, 1.5, 1, 1]
		  fingerprint = dc.feat.CircularFingerprint(size=1024)
		  in_memory_loader = dc.data.InMemoryLoader(
		      tasks=["task1"], featurizer=fingerprint)
		  triples = zip(molecules, targets, sample_weights)
		  featurized = in_memory_loader.create_dataset(triples, shard_size=2)

		  expected_y = np.array(targets)
		  expected_w = np.array(sample_weights)
		  expected_ids = np.arange(4)
		  assert len(featurized) == 4
		  assert featurized.X.shape == (4, 1024)
		  assert (featurized.y == expected_y).all()
		  assert (featurized.w == expected_w).all()
		  assert (featurized.ids == expected_ids).all()
		  assert featurized.get_number_shards() == 2


		def test_inmemory_features_and_labels_and_weights_and_ids():
		  """Full 4-tuples: labels, weights and user-supplied ids are all checked."""
		  molecules = ["C", "CC", "CCC", "CCCC"]
		  targets = [1, 0, 1, 0]
		  sample_weights = [1.5, 1.5, 1, 1]
		  # The SMILES strings double as the datapoint identifiers.
		  identifiers = molecules
		  fingerprint = dc.feat.CircularFingerprint(size=1024)
		  in_memory_loader = dc.data.InMemoryLoader(
		      tasks=["task1"], featurizer=fingerprint)
		  records = zip(molecules, targets, sample_weights, identifiers)
		  featurized = in_memory_loader.create_dataset(records, shard_size=2)

		  expected_y = np.array(targets)
		  expected_w = np.array(sample_weights)
		  expected_ids = np.array(identifiers)
		  assert len(featurized) == 4
		  assert featurized.X.shape == (4, 1024)
		  assert (featurized.y == expected_y).all()
		  assert (featurized.w == expected_w).all()
		  assert (featurized.ids == expected_ids).all()
		  assert featurized.get_number_shards() == 2

deepchem/feat/base_classes.py

+9 −0

Original line number	Diff line number	Diff line
		@@ -183,6 +183,8 @@ class MolecularFeaturizer(Featurizer):
		"""
		try:
		from rdkit import Chem
		from rdkit.Chem import rdmolfiles
		from rdkit.Chem import rdmolops
		from rdkit.Chem.rdchem import Mol
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		@@ -201,6 +203,13 @@ class MolecularFeaturizer(Featurizer):
		if isinstance(mol, str):
		# mol must be a SMILES string so parse
		mol = Chem.MolFromSmiles(mol)
		# TODO (ytz) this is a bandage solution to reorder the atoms
		# so that they're always in the same canonical order.
		# Presumably this should be correctly implemented in the
		# future for graph mols.
		if mol:
		new_order = rdmolfiles.CanonicalRankAtoms(mol)
		mol = rdmolops.RenumberAtoms(mol, new_order)
		features.append(self._featurize(mol))
		except:
		logger.warning(

deepchem/utils/save.py

+55 −13

Original line number	Diff line number	Diff line
		@@ -45,21 +45,32 @@ def get_input_type(input_file):
		raise ValueError("Unrecognized extension %s" % file_extension)


		def load_data(input_files, shard_size=None, verbose=True):
		def load_data(input_files, shard_size=None):
		"""Loads data from disk.

		For CSV files, supports sharded loading for large files.

		Parameters
		----------
		input_files: list
		List of filenames.
		shard_size: int, optional (default None)
		Size of shard to yield

		Returns
		-------
		Iterator which iterates over provided files.
		"""
		if not len(input_files):
		return
		input_type = get_input_type(input_files[0])
		if input_type == "sdf":
		if shard_size is not None:
		logger.info("Ignoring shard_size for sdf input.", verbose)
		logger.info("Ignoring shard_size for sdf input.")
		for value in load_sdf_files(input_files):
		yield value
		elif input_type == "csv":
		for value in load_csv_files(input_files, shard_size, verbose=verbose):
		for value in load_csv_files(input_files, shard_size):
		yield value
		elif input_type == "pandas-pickle":
		for input_file in input_files:
		@@ -67,7 +78,29 @@ def load_data(input_files, shard_size=None, verbose=True):


		def load_sdf_files(input_files, clean_mols, tasks=[]):
		"""Load SDF file into dataframe."""
		"""Load SDF file into dataframe.

		Parameters
		----------
		input_files: list[str]
		List of filenames
		clean_mols: bool
		Whether to sanitize molecules.
		tasks: list, optional (default [])
		Each entry in `tasks` is treated as a property in the SDF file and is
		retrieved with `mol.GetProp(str(task))` where `mol` is the RDKit mol
		loaded from a given SDF entry.

		Note
		----
		This function requires RDKit to be installed.

		Returns
		-------
		dataframes: list
		This function returns a list of pandas dataframes. Each dataframe will
		have columns `('mol_id', 'smiles', 'mol')`.
		"""
		from rdkit import Chem
		dataframes = []
		for input_file in input_files:
		@@ -97,19 +130,30 @@ def load_sdf_files(input_files, clean_mols, tasks=[]):
		return dataframes


		def load_csv_files(filenames, shard_size=None, verbose=True):
		"""Load data as pandas dataframe."""
		def load_csv_files(filenames, shard_size=None):
		"""Load data as pandas dataframe.

		Parameters
		----------
		filenames: list[str]
		List of filenames
		shard_size: int, optional (default None)
		The shard size to yield at one time.

		Returns
		-------
		Iterator which iterates over shards of data.
		"""
		# First line of user-specified CSV must be header.
		shard_num = 1
		for filename in filenames:
		if shard_size is None:
		yield pd.read_csv(filename)
		else:
		logger.info("About to start loading CSV from %s" % filename, verbose)
		logger.info("About to start loading CSV from %s" % filename)
		for df in pd.read_csv(filename, chunksize=shard_size):
		logger.info(
		"Loading shard %d of size %s." % (shard_num, str(shard_size)),
		verbose)
		"Loading shard %d of size %s." % (shard_num, str(shard_size)))
		df = df.replace(np.nan, str(""), regex=True)
		shard_num += 1
		yield df
		@@ -227,8 +271,8 @@ def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):


		def save_metadata(tasks, metadata_df, data_dir):
		"""
		Saves the metadata for a DiskDataset
		"""Saves the metadata for a DiskDataset

		Parameters
		----------
		tasks: list of str
		@@ -236,8 +280,6 @@ def save_metadata(tasks, metadata_df, data_dir):
		metadata_df: pd.DataFrame
		data_dir: str
		Directory to store metadata
		Returns
		-------
		"""
		if isinstance(tasks, np.ndarray):
		tasks = tasks.tolist()

Admin message