Soft import Biopython, trim usage. (b213f036) · Commits · 钟慕尧 / deepchem

deepchem/utils/save.py

+8 −20

Original line number	Diff line number	Diff line
		@@ -16,9 +16,6 @@ import numpy as np
		import os
		import deepchem
		from rdkit import Chem
		from Bio.Data.IUPACData import unambiguous_dna_letters
		from Bio.Alphabet import Alphabet
		from Bio import SeqIO


		def log(string, verbose=True):
		@@ -169,13 +166,6 @@ def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
		return b


		class IUPACUnambiguousDNAWithN(Alphabet):
		"""
		A deepchem specific DNA Alphabet with the 4 main letters and N.
		"""
		letters = "ACGTN"


		def encode_fasta_sequence(fname):
		"""
		Loads fasta file and returns an array of one-hot sequences.
		@@ -187,14 +177,13 @@ def encode_fasta_sequence(fname):

		Returns
		-------
		np.ndarray: Shape (N_sequences, 4, sequence_length, 1).
		np.ndarray: Shape (N_sequences, 5, sequence_length, 1).
		"""

		dna_alphabet = IUPACUnambiguousDNAWithN()
		return encode_bio_sequence(fname, "fasta", dna_alphabet)
		return encode_bio_sequence(fname)


		def encode_bio_sequence(fname, file_type="fasta", alphabet=None):
		def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
		"""
		Loads a sequence file and returns an array of one-hot sequences.

		@@ -205,19 +194,18 @@ def encode_bio_sequence(fname, file_type="fasta", alphabet=None):
		file_type: str
		The type of file encoding to process, e.g. fasta or fastq. This
		is passed to Biopython's Bio.SeqIO.parse.
		alphabet: Bio.Alphabet.Alphabet
		A Biopython Alphabet, which should have a letter class property.
		letters: str
		The set of letters that the sequences consist of, e.g. ATCG.

		Returns
		-------
		np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
		"""

		if alphabet is None:
		alphabet = IUPACUnambiguousDNAWithN()
		from Bio import SeqIO

		sequences = SeqIO.parse(fname, file_type, alphabet)
		return seq_one_hot_encode(sequences, alphabet.letters)
		sequences = SeqIO.parse(fname, file_type)
		return seq_one_hot_encode(sequences, letters)


		def save_metadata(tasks, metadata_df, data_dir):

deepchem/utils/test/test_seq.py

+3 −6

Original line number	Diff line number	Diff line
		@@ -15,9 +15,7 @@ from Bio.Alphabet import Alphabet

		import deepchem as dc


		class MockAlphabet(Alphabet):
		letters = "XYZ"
		LETTERS = "XYZ"


		class TestSeq(unittest.TestCase):
		@@ -46,8 +44,7 @@ class TestSeq(unittest.TestCase):
		# Test it's possible to load a sequence with an arbitrary alphabet from a fasta file.
		fname = os.path.join(self.current_dir, "./data/example.fasta")

		encoded_seqs = dc.utils.save.encode_bio_sequence(
		fname, alphabet=MockAlphabet())
		encoded_seqs = dc.utils.save.encode_bio_sequence(fname, letters=LETTERS)
		expected = np.expand_dims(
		np.array([
		[[1, 0], [0, 1], [0, 0]],
		@@ -60,7 +57,7 @@ class TestSeq(unittest.TestCase):
		fname = os.path.join(self.current_dir, "./data/example.fastq")

		encoded_seqs = dc.utils.save.encode_bio_sequence(
		fname, file_type="fastq", alphabet=MockAlphabet())
		fname, file_type="fastq", letters=LETTERS)

		expected = np.expand_dims(
		np.array([

Admin message