Unverified Commit aaed65f7 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2565 from alat-rights/FASTALoaderNew

Update FASTA Loader to accept arbitrary featurizers
parents a882ad8f 72f1c866
Loading
Loading
Loading
Loading
+115 −8
Original line number Diff line number Diff line
@@ -14,9 +14,10 @@ import numpy as np

from deepchem.utils.typing import OneOrMany
from deepchem.utils.data_utils import load_image_files, load_csv_files, load_json_files, load_sdf_files
from deepchem.utils.genomics_utils import encode_bio_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.utils.genomics_utils import encode_bio_sequence

logger = logging.getLogger(__name__)

@@ -875,9 +876,70 @@ class FASTALoader(DataLoader):
  learning tasks.
  """

  def __init__(self):
    """Initialize loader."""
    pass
  def __init__(self,
               featurizer: Optional[Featurizer] = None,
               auto_add_annotations: bool = False,
               legacy: bool = True):
    """Initialize FASTALoader.

    Parameters
    ----------
    featurizer: Featurizer (default: None)
      The Featurizer to be used for the loaded FASTA data.

      If featurizer is None and legacy is True, the original featurization
      logic is used, creating a one hot encoding of all included FASTA strings
      of shape
      (number of FASTA sequences, number of channels + 1, sequence length, 1).

      If featurizer is None and legacy is False, the featurizer is initialized
      as a OneHotFeaturizer object with charset ("A", "C", "T", "G") and
      max_length = None.

    auto_add_annotations: bool (default False)
      Whether create_dataset will automatically add [CLS] and [SEP] annotations
      to the sequences it reads in order to assist tokenization.
      Keep False if your FASTA file already includes [CLS] and [SEP] annotations.

    legacy: bool (default True)
      Whether to use legacy logic for featurization. Legacy mode will create
      a one hot encoding of the FASTA content of shape
      (number of FASTA sequences, number of channels + 1, max length, 1).

      Legacy mode is only tested for ACTGN charsets, and will be deprecated.

    Raises
    ------
    ValueError
      If legacy is True while featurizer is not None or
      auto_add_annotations is True (legacy mode supports neither).
    """
    # Process legacy toggle
    if legacy:
      # Fail fast: validate the incompatible-options case before emitting the
      # deprecation warning, so invalid construction does not also warn.
      if featurizer is not None or auto_add_annotations:
        raise ValueError(f"""
                          featurizer option must be None and
                          auto_add_annotations must be false when legacy mode
                          is enabled. You set featurizer to {featurizer} and
                          auto_add_annotations to {auto_add_annotations}.
                          """)
      warnings.warn(
          """
                    Legacy mode is deprecated and will be removed in
                    DeepChem 3.0. Disable legacy mode by passing legacy=False
                    during construction of FASTALoader object.
                    """, FutureWarning)

    # Set attributes
    self.legacy = legacy
    self.auto_add_annotations = auto_add_annotations

    # Populated only when a UserDefinedFeaturizer is supplied (DataLoader
    # convention); otherwise stays None.
    self.user_specified_features = None

    # Handle special featurizer cases
    if isinstance(featurizer, UserDefinedFeaturizer):  # User defined featurizer
      self.user_specified_features = featurizer.feature_fields
    elif featurizer is None:  # Default featurizer
      # Default: one-hot encode over the DNA charset with no length padding.
      featurizer = OneHotFeaturizer(
          charset=["A", "C", "T", "G"], max_length=None)

    # Set self.featurizer
    self.featurizer = featurizer

  def create_dataset(self,
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and doesn't allow for sharding.

    Parameters
    ----------
    input_files: List[str]
      List of FASTA files, or a single FASTA file path.
    data_dir: str, optional (default None)
      Directory where the featurized DiskDataset is stored.
    shard_size: int, optional (default None)
      Currently ignored; each input file becomes one shard.

    Returns
    -------
    DiskDataset
      A `DiskDataset` object containing a featurized representation
      of the data from `input_files`.
    """
    if isinstance(input_files, str):
      input_files = [input_files]

    def _add_sequence(sequences: np.ndarray,
                      sequence: np.ndarray) -> np.ndarray:
      """Join `sequence` fragments into one string and append to `sequences`."""
      # Handle empty sequence (e.g. two consecutive header lines).
      if sequence is None or len(sequence) <= 0:
        # BUGFIX: previously returned np.array([]), discarding every
        # previously accumulated sequence. Keep what we already have.
        # TODO log attempts to add empty sequences every shard
        return sequences
      # Annotate start/stop of sequence.
      # BUGFIX: honor the loader's auto_add_annotations flag; the old inner
      # parameter defaulted to False and was never passed, so the option
      # silently had no effect.
      if self.auto_add_annotations:
        sequence = np.insert(sequence, 0, "[CLS]")
        sequence = np.append(sequence, "[SEP]")
      return np.append(sequences, ''.join(sequence))

    def _read_file(input_file: str) -> np.ndarray:
      """Convert a FASTA file to a numpy array of FASTA-format strings."""

      # TODO don't convert all sequences into np array (allow shards)
      def _generate_sequences(fasta_file, header_mark=">") -> np.ndarray:
        """Accumulate annotated FASTA-format strings from an open file."""
        sequences = np.array([])
        sequence = np.array([])
        header_read = False
        for line in fasta_file:
          if line.startswith(header_mark):  # New header line
            header_read = True
            sequences = _add_sequence(sequences, sequence)
            sequence = np.array([])
          elif header_read:  # Line contains sequence in FASTA format
            # Strip the trailing newline before appending the fragment.
            sequence = np.append(sequence, line.rstrip('\n'))
        return _add_sequence(sequences, sequence)  # Add last sequence

      with open(input_file, 'r') as f:  # Read FASTA file
        return _generate_sequences(f)

    def shard_generator():  # TODO Enable sharding with shard size parameter
      for input_file in input_files:
        if self.legacy:
          X = encode_bio_sequence(input_file)
        else:
          X = self.featurizer(_read_file(input_file))
        # NOTE(review): ids are all ones, so they are not unique per sample —
        # confirm downstream DiskDataset consumers tolerate duplicate ids.
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)


+29 −2
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ import os
import unittest

import deepchem as dc
from deepchem.feat.molecule_featurizers import OneHotFeaturizer


class TestFASTALoader(unittest.TestCase):
@@ -16,13 +17,39 @@ class TestFASTALoader(unittest.TestCase):
    super(TestFASTALoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def test_legacy_fasta_one_hot(self):
    """Legacy mode one-hot encodes FASTA into (N, channels, length, 1)."""
    fasta_path = os.path.join(self.current_dir,
                              "../../data/tests/example.fasta")
    dataset = dc.data.FASTALoader(legacy=True).create_dataset(fasta_path)

    # example.fasta holds 3 sequences, each of length 58. The legacy
    # encoding maps every base pair to a length-5 vector (ATCGN) and adds
    # a single "image channel" axis.
    assert dataset.X.shape == (3, 5, 58, 1)

  def test_fasta_one_hot(self):
    """Non-legacy mode one-hot encodes FASTA into (N, length, charset+1)."""
    fasta_path = os.path.join(self.current_dir,
                              "../../data/tests/example.fasta")
    dataset = dc.data.FASTALoader(legacy=False).create_dataset(fasta_path)

    # After the FASTALoader redesign the channel axis is gone and the
    # one-hot dimension (ACTG + unknown = 5) comes last.
    assert dataset.X.shape == (3, 58, 5)

  def test_fasta_one_hot_big(self):
    """One-hot featurize a protein FASTA file with a large custom charset."""
    protein = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '*', '-'
    ]
    input_file = os.path.join(self.current_dir,
                              "../../data/tests/uniprot_truncated.fasta")
    loader = dc.data.FASTALoader(
        OneHotFeaturizer(charset=protein, max_length=1000), legacy=False)
    sequences = loader.create_dataset(input_file)

    # BUGFIX: `assert sequences.X.shape` was vacuous — a non-empty tuple is
    # always truthy. Pin the real contract instead: at least one sequence,
    # each padded to max_length=1000 and encoded over 28 charset symbols
    # plus the unknown-character slot (29).
    assert len(sequences.X) > 0
    assert sequences.X.shape[1:] == (1000, 29)

  # TODO: test with full uniprot file once sharding support is added.
+354 −0

File added.

Preview size limit exceeded, changes collapsed.

+27 −18
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@ import numpy as np
from deepchem.utils.typing import RDKitMol
from deepchem.utils.molecule_feature_utils import one_hot_encode
from deepchem.feat.base_classes import Featurizer
from typing import Any, Iterable
from typing import Any, Iterable, Optional

logger = logging.getLogger(__name__)

@@ -32,21 +32,30 @@ class OneHotFeaturizer(Featurizer):
  It does not need RDKit to be installed to work with arbitrary strings.
  """

  def __init__(self,
               charset: List[str] = ZINC_CHARSET,
               max_length: Optional[int] = 100):
    """Initialize featurizer.

    Parameters
    ----------
    charset: List[str] (default ZINC_CHARSET)
      A list of strings, where each string is length 1 and unique.
    max_length: Optional[int], optional (default 100)
      The max length for string. If the length of string is shorter than
      max_length, the string is padded using space.

      If max_length is None, no padding is performed and arbitrary length
      strings are allowed.

    Raises
    ------
    ValueError
      If `charset` contains duplicate entries.
    """
    if len(charset) != len(set(charset)):
      raise ValueError("All values in charset must be unique.")
    self.charset = charset
    # BUGFIX: the previous code first assigned the typing construct
    # `Optional[int]` itself to self.max_length before overwriting it — a
    # confusing no-op. Normalize in one step: int when given, else None.
    self.max_length = int(max_length) if max_length is not None else None

  def featurize(self, datapoints: Iterable[Any],
                log_every_n: int = 1000) -> np.ndarray:
    """Featurize strings or mols.

    Parameters
    ----------
    datapoints: list
      A list of either strings (str or numpy.str_) or RDKit molecules.
    log_every_n: int, optional (default 1000)
      How many elements are featurized every time a featurization is logged.
    """
    # Materialize the iterable once so it can be length-checked and reused.
    datapoints = list(datapoints)
    if not datapoints:
      return np.array([])
    # Delegate the actual work to featurize() in the parent class.
    return Featurizer.featurize(self, datapoints, log_every_n)

  def _featurize(self, datapoint: Any):
    # Featurize str data
    if (type(datapoint) == str):
    if isinstance(datapoint, (str, np.str_)):
      return self._featurize_string(datapoint)
    # Featurize mol data
    else:
@@ -88,14 +97,11 @@ class OneHotFeaturizer(Featurizer):
      The shape is `(max_length, len(charset) + 1)`.
      The index of unknown character is `len(charset)`.
    """
    # validation
    if (len(string) > self.max_length):
      logger.info(
          "The length of {} is longer than `max_length`. So we return an empty array."
      )
      return np.array([])

    if isinstance(self.max_length, int):
      if (len(string) > self.max_length):  # Validation
        raise ValueError("The length of {} is longer than `max_length`.")
      string = self.pad_string(string)  # Padding

    return np.array([
        one_hot_encode(val, self.charset, include_unknown_set=True)
        for val in string
@@ -151,7 +157,10 @@ class OneHotFeaturizer(Featurizer):
    str
      String space padded to self.pad_length
    """
    if isinstance(self.max_length, int):
      return string.ljust(self.max_length)
    else:
      return string

  def untransform(self, one_hot_vectors: np.ndarray) -> str:
    """Convert from one hot representation back to original string