first (e2fd409b) · Commits · 钟慕尧 / deepchem

deepchem/data/data_loader.py

+256 −111

Original line number	Diff line number	Diff line
		@@ -10,7 +10,7 @@ import numbers
		import tempfile
		import time
		import sys
		from deepchem.utils.save import log
		import logging
		from deepchem.utils.save import load_csv_files
		from deepchem.utils.save import load_sdf_files
		from deepchem.utils.genomics import encode_fasta_sequence
		@@ -19,8 +19,10 @@ from deepchem.data import DiskDataset, NumpyDataset, ImageDataset
		import zipfile
		from PIL import Image

		logger = logging.getLogger(__name__)

		def convert_df_to_numpy(df, tasks, verbose=False):

		def _convert_df_to_numpy(df, tasks):
		"""Transforms a dataframe containing deepchem input into numpy arrays"""
		n_samples = df.shape[0]
		n_tasks = len(tasks)
		@@ -50,12 +52,12 @@ def convert_df_to_numpy(df, tasks, verbose=False):
		return y.astype(float), w.astype(float)


		def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
		def _featurize_smiles_df(df, featurizer, field, log_every_N=1000):
		"""Featurize individual compounds in dataframe.

		Given a featurizer that operates on individual chemical compounds
		or macromolecules, compute & add features for that compound to the
		features dataframe
		Given a featurizer that operates on individual chemical
		compounds or macromolecules, compute & add features for that
		compound to the features dataframe
		"""
		sample_elems = df[field].tolist()

		@@ -65,14 +67,15 @@ def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
		from rdkit.Chem import rdmolops
		for ind, elem in enumerate(sample_elems):
		mol = Chem.MolFromSmiles(elem)
		# TODO (ytz) this is a bandage solution to reorder the atoms so
		# that they're always in the same canonical order. Presumably this
		# should be correctly implemented in the future for graph mols.
		# TODO (ytz) this is a bandage solution to reorder the atoms
		# so that they're always in the same canonical order.
		# Presumably this should be correctly implemented in the
		# future for graph mols.
		if mol:
		new_order = rdmolfiles.CanonicalRankAtoms(mol)
		mol = rdmolops.RenumberAtoms(mol, new_order)
		if ind % log_every_N == 0:
		log("Featurizing sample %d" % ind, verbose)
		logger.info("Featurizing sample %d" % ind)
		features.append(featurizer.featurize([mol]))
		valid_inds = np.array(
		[1 if elt.size > 0 else 0 for elt in features], dtype=bool)
		@@ -80,34 +83,34 @@ def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
		return np.squeeze(np.array(features), axis=1), valid_inds


		def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
		"""Featurize individual compounds in a numpy array.

		Given a featurizer that operates on individual chemical compounds
		or macromolecules, compute & add features for that compound to the
		features array
		"""
		features = []
		from rdkit import Chem
		from rdkit.Chem import rdmolfiles
		from rdkit.Chem import rdmolops
		for ind, elem in enumerate(arr.tolist()):
		mol = Chem.MolFromSmiles(elem)
		if mol:
		new_order = rdmolfiles.CanonicalRankAtoms(mol)
		mol = rdmolops.RenumberAtoms(mol, new_order)
		if ind % log_every_N == 0:
		log("Featurizing sample %d" % ind, verbose)
		features.append(featurizer.featurize([mol]))

		valid_inds = np.array(
		[1 if elt.size > 0 else 0 for elt in features], dtype=bool)
		features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
		features = np.squeeze(np.array(features))
		return features.reshape(-1,)


		def get_user_specified_features(df, featurizer, verbose=True):
		#def _featurize_smiles_np(arr, featurizer, log_every_N=1000):
		# """Featurize individual compounds in a numpy array.
		#
		# Given a featurizer that operates on individual chemical compounds
		# or macromolecules, compute & add features for that compound to the
		# features array
		# """
		# features = []
		# from rdkit import Chem
		# from rdkit.Chem import rdmolfiles
		# from rdkit.Chem import rdmolops
		# for ind, elem in enumerate(arr.tolist()):
		# mol = Chem.MolFromSmiles(elem)
		# if mol:
		# new_order = rdmolfiles.CanonicalRankAtoms(mol)
		# mol = rdmolops.RenumberAtoms(mol, new_order)
		# if ind % log_every_N == 0:
		# logger.info("Featurizing sample %d" % ind)
		# features.append(featurizer.featurize([mol]))
		#
		# valid_inds = np.array(
		# [1 if elt.size > 0 else 0 for elt in features], dtype=bool)
		# features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
		# features = np.squeeze(np.array(features))
		# return features.reshape(-1,)


		def _get_user_specified_features(df, featurizer):
		"""Extract and merge user specified features.

		Merge features included in dataset provided by user
		@@ -128,26 +131,32 @@ def get_user_specified_features(df, featurizer, verbose=True):
		pd.to_numeric)
		X_shard = df[featurizer.feature_fields].to_numpy()
		time2 = time.time()
		log("TIMING: user specified processing took %0.3f s" % (time2 - time1),
		verbose)
		logger.info("TIMING: user specified processing took %0.3f s" % (time2 - time1))
		return X_shard


		def featurize_mol_df(df, featurizer, field, verbose=True, log_every_N=1000):
		def _featurize_mol_df(df, featurizer, field, log_every_N=1000):
		"""Featurize individual compounds in dataframe.

		Featurizes .sdf files, so the 3-D structure should be preserved
		so we use the rdkit "mol" object created from .sdf instead of smiles
		string. Some featurizers such as CoulombMatrix also require a 3-D
		structure. Featurizing from .sdf is currently the only way to
		perform CM featurization.
		Used when processing .sdf files, so the 3-D structure should be
		preserved. We use the rdkit "mol" object created from .sdf
		instead of smiles string. Some featurizers such as
		CoulombMatrix also require a 3-D structure. Featurizing from
		.sdf is currently the only way to perform CM featurization.

		Parameters
		----------
		df: Pandas Dataframe
		Should be created by dc.utils.save.load_sdf_files.
		featurizer: dc.feat.MolecularFeaturizer
		Featurizer for molecules.
		"""
		sample_elems = df[field].tolist()

		features = []
		for ind, mol in enumerate(sample_elems):
		if ind % log_every_N == 0:
		log("Featurizing sample %d" % ind, verbose)
		logger.info("Featurizing sample %d" % ind)
		features.append(featurizer.featurize([mol]))
		valid_inds = np.array(
		[1 if elt.size > 0 else 0 for elt in features], dtype=bool)
		@@ -156,32 +165,50 @@ def featurize_mol_df(df, featurizer, field, verbose=True, log_every_N=1000):


		class DataLoader(object):
		"""
		Handles loading/featurizing of chemical samples (datapoints).

		Currently knows how to load csv-files/pandas-dataframes/SDF-files. Writes a
		dataframe object to disk as output.
		"""Handles loading/featurizing of data from disk.

		The `Featurizer` objects can featurize provided input into
		numpy arrays but don't generate `Dataset` objects. You can
		of course wrap numpy arrays into `Dataset` objects with
		`dc.data.NumpyDataset`, but you might face some difficulty
		with larger dataset processing. The main use of `DataLoader`
		and its child classes is to make it easier to load large
		datasets into `Dataset` objects. You won't ever "need" to use
		a `DataLoader` but might often find it convenient when
		processing larger datasets.

		Note that `DataLoader` is an abstract superclass that
		provides a general framework for loading data into DeepChem.
		To load your own type of data, make a subclass of
		`DataLoader` and provide your own implementation for
		`featurize`.
		"""

		def __init__(self,
		tasks,
		smiles_field=None,
		id_field=None,
		mol_field=None,
		featurizer=None,
		verbose=True,
		log_every_n=1000):
		"""Extracts data from input as Pandas data frame"""
		"""Construct a DataLoader object.

		This constructor is provided mainly as a template. You
		shouldn't ever call this constructor directly as a user.

		Parameters
		----------
		tasks: list[str]
		List of task names
		id_field: str, optional
		Name of field that holds sample identifier
		featurizer: dc.feat.Featurizer, optional
		Featurizer to use to process data
		log_every_n: int, optional
		Writes a logging statement this often.
		"""
		if not isinstance(tasks, list):
		raise ValueError("tasks must be a list.")
		self.verbose = verbose
		self.tasks = tasks
		self.smiles_field = smiles_field
		if id_field is None:
		self.id_field = smiles_field
		else:
		self.id_field = id_field
		self.mol_field = mol_field
		self.user_specified_features = None
		if isinstance(featurizer, UserDefinedFeaturizer):
		self.user_specified_features = featurizer.feature_fields
		@@ -198,50 +225,50 @@ class DataLoader(object):
		----------
		input_files: list
		List of input filenames.
		data_dir: str
		(Optional) Directory to store featurized dataset.
		shard_size: int
		(Optional) Number of examples stored in each shard.
		data_dir: str, optional
		Directory to store featurized dataset.
		shard_size: int, optional
		Number of examples stored in each shard.
		"""
		log("Loading raw samples now.", self.verbose)
		log("shard_size: %d" % shard_size, self.verbose)
		logger.info("Loading raw samples now.")
		logger.info("shard_size: %d" % shard_size)

		if not isinstance(input_files, list):
		input_files = [input_files]

		def shard_generator():
		for shard_num, shard in enumerate(
		self.get_shards(input_files, shard_size)):
		self._get_shards(input_files, shard_size)):
		time1 = time.time()
		X, valid_inds = self.featurize_shard(shard)
		X, valid_inds = self._featurize_shard(shard)
		ids = shard[self.id_field].values
		ids = ids[valid_inds]
		if len(self.tasks) > 0:
		# Featurize task results iff they exist.
		y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
		y, w = _convert_df_to_numpy(shard, self.tasks)
		# Filter out examples where featurization failed.
		y, w = (y[valid_inds], w[valid_inds])
		assert len(X) == len(ids) == len(y) == len(w)
		else:
		# For prospective data where results are unknown, it makes
		# no sense to have y values or weights.
		# For prospective data where results are unknown, it
		# makes no sense to have y values or weights.
		y, w = (None, None)
		assert len(X) == len(ids)

		time2 = time.time()
		log(
		logger.info(
		"TIMING: featurizing shard %d took %0.3f s" %
		(shard_num, time2 - time1), self.verbose)
		(shard_num, time2 - time1))
		yield X, y, w, ids

		return DiskDataset.create_dataset(
		shard_generator(), data_dir, self.tasks, verbose=self.verbose)
		shard_generator(), data_dir, self.tasks)

		def get_shards(self, input_files, shard_size):
		def _get_shards(self, input_files, shard_size):
		"""Stub for children classes."""
		raise NotImplementedError

		def featurize_shard(self, shard):
		def _featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		raise NotImplementedError

		@@ -249,30 +276,72 @@ class DataLoader(object):
		class CSVLoader(DataLoader):
		"""
		Handles loading of CSV files.

		This class provides conveniences to load data from CSV files.
		It's possible to directly featurize data from CSV files using
		pandas, but this class may prove useful if you're processing
		large CSV files that you don't want to manipulate directly in
		memory.
		"""

		def get_shards(self, input_files, shard_size, verbose=True):
		def __init__(self,
		tasks,
		smiles_field=None,
		id_field=None,
		featurizer=None,
		log_every_n=1000):
		"""Initializes CSVLoader.

		Parameters
		----------
		tasks: list[str]
		List of task names
		smiles_field: str, optional
		Name of field that holds smiles string
		id_field: str, optional
		Name of field that holds sample identifier
		featurizer: dc.feat.Featurizer, optional
		Featurizer to use to process data
		log_every_n: int, optional
		Writes a logging statement this often.
		"""
		if not isinstance(tasks, list):
		raise ValueError("tasks must be a list.")
		self.tasks = tasks
		self.smiles_field = smiles_field
		if id_field is None:
		self.id_field = smiles_field
		else:
		self.id_field = id_field
		#self.mol_field = mol_field
		self.user_specified_features = None
		if isinstance(featurizer, UserDefinedFeaturizer):
		self.user_specified_features = featurizer.feature_fields
		self.featurizer = featurizer
		self.log_every_n = log_every_n

		def _get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		return load_csv_files(input_files, shard_size, verbose=verbose)
		return load_csv_files(input_files, shard_size)

		def featurize_shard(self, shard):
		def _featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		return featurize_smiles_df(shard, self.featurizer, field=self.smiles_field)
		return _featurize_smiles_df(shard, self.featurizer, field=self.smiles_field, log_every_N=self.log_every_n)


		class UserCSVLoader(DataLoader):
		class UserCSVLoader(CSVLoader):
		"""
		Handles loading of CSV files with user-defined featurizers.
		"""

		def get_shards(self, input_files, shard_size):
		def _get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		return load_csv_files(input_files, shard_size)

		def featurize_shard(self, shard):
		def _featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		assert isinstance(self.featurizer, UserDefinedFeaturizer)
		X = get_user_specified_features(shard, self.featurizer)
		X = _get_user_specified_features(shard, self.featurizer)
		return (X, np.ones(len(X), dtype=bool))


		@@ -281,34 +350,55 @@ class SDFLoader(DataLoader):
		Handles loading of SDF files.
		"""

		def __init__(self, tasks, clean_mols=False, **kwargs):
		super(SDFLoader, self).__init__(tasks, **kwargs)
		def __init__(self, tasks, clean_mols=False, featurizer=None, log_every_n=1000):
		"""Initialize SDF Loader

		Parameters
		----------
		tasks: list[str]
		List of tasknames. These will be loaded from the SDF file.
		clean_mols: bool, optional
		Whether to sanitize molecules.
		featurizer: dc.feat.Featurizer, optional
		Featurizer to use to process data
		log_every_n: int, optional
		Writes a logging statement this often.
		"""
		self.featurizer = featurizer
		self.clean_mols = clean_mols
		self.tasks = tasks
		self.smiles_field = "smiles"
		# The field in which dc.utils.save.load_sdf_files stores
		# RDKit mol objects
		self.mol_field = "mol"
		# The field in which load_sdf_files return value stores
		# smiles
		self.id_field = "smiles"
		self.log_every_n = log_every_n

		def get_shards(self, input_files, shard_size):
		def _get_shards(self, input_files, shard_size):
		"""Defines a generator which returns data for each shard"""
		return load_sdf_files(input_files, self.clean_mols, tasks=self.tasks)

		def featurize_shard(self, shard):
		def _featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		log(
		logger.info(
		"Currently featurizing feature_type: %s" %
		self.featurizer.__class__.__name__, self.verbose)
		return featurize_mol_df(shard, self.featurizer, field=self.mol_field)
		self.featurizer.__class__.__name__)
		return _featurize_mol_df(shard, self.featurizer, field=self.mol_field, log_every_N=self.log_every_n)


		class FASTALoader(DataLoader):
		"""
		Handles loading of FASTA files.
		"""Handles loading of FASTA files.

		FASTA files are commonly used to hold sequence data. This
		class provides conveniences to load FASTA data and
		one-hot encode the genomic sequences for use in downstream
		learning tasks.
		"""

		def __init__(self, verbose=True):
		def __init__(self):
		"""Initialize loader."""
		self.verbose = verbose
		pass

		def featurize(self, input_files, data_dir=None):
		"""Featurizes fasta files.
		@@ -317,8 +407,8 @@ class FASTALoader(DataLoader):
		----------
		input_files: list
		List of fasta files.
		data_dir: str
		(Optional) Name of directory where featurized data is stored.
		data_dir: str, optional
		Name of directory where featurized data is stored.
		"""
		if not isinstance(input_files, list):
		input_files = [input_files]
		@@ -334,17 +424,22 @@ class FASTALoader(DataLoader):


		class ImageLoader(DataLoader):
		"""
		Handles loading of image files.
		"""Handles loading of image files.

		This class allows for loading of images in various formats. For user
		convenience, also accepts zip-files and directories of images and uses some
		limited intelligence to attempt to traverse subdirectories which contain
		images.
		This class allows for loading of images in various formats.
		For user convenience, also accepts zip-files and directories
		of images and uses some limited intelligence to attempt to
		traverse subdirectories which contain images.
		"""

		def __init__(self, tasks=None):
		"""Initialize image loader."""
		"""Initialize image loader.

		Parameters
		----------
		tasks: list[str]
		List of task names for image labels.
		"""
		if tasks is None:
		tasks = []
		self.tasks = tasks
		@@ -355,9 +450,13 @@ class ImageLoader(DataLoader):
		Parameters
		----------
		input_files: list
		Each file in this list should either be of a supported image format
		(.png, .tif only for now) or of a compressed folder of image files
		(only .zip for now).
		Each file in this list should either be of a supported
		image format (.png, .tif only for now) or of a compressed
		folder of image files (only .zip for now).
		labels: optional
		If provided, a numpy ndarray of image labels
		weights: optional
		If provided, a numpy ndarray of image weights
		in_memory: bool
		If true, return in-memory NumpyDataset. Else return ImageDataset.
		"""
		@@ -419,3 +518,49 @@ class ImageLoader(DataLoader):
		else:
		raise ValueError("Unsupported image filetype for %s" % image_file)
		return np.array(images)

		class MolecularComplexLoader(DataLoader):
		"""Handles Loading of Molecular Complex Data

		This class provides conveniences to load and featurize
		datasets of macromolecular complexes. The idea here is that
		each "datapoint" is specified by one or more PDB/sdf files
		which hold the 3D structures for the sample that you're
		considering. This loader will load these complexes and
		featurize them for you.

		Featurizing macromolecular complex data can take a long time,
		so for convenience, this class provides restart capabilities
		which will restart a stopped featurization process for a
		collection of complexes.
		"""

		def __init__(self, data_dir=None):
		"""Initialize MolecularComplexLoader.

		Parameters
		----------
		data_dir: str, optional
		Directory to use for saving intermediate featurizations
		and the final produced dataset.
		"""
		raise NotImplementedError

		def featurize(self, input_files, labels=None, weights=None):
		"""Featurizes Macromolecular Complex Data.

		Parameters
		----------
		input_files: list
		Each entry in this list should be the collection of all
		files for a given complex. If only one file is present,
		this is just a string for the filename. Otherwise, this
		should be a list of the filenames for the constituent
		files.
		labels: optional
		If provided, a numpy ndarray of labels for the complexes
		weights: optional
		If provided, a numpy ndarray of weights for the complexes
		"""
		raise NotImplementedError

deepchem/data/test_data_loader.py→deepchem/data/tests/test_csv_loader.py

+0 −0

File moved.

View file

Admin message