Add more tests. (f67fb30f) · Commits · 钟慕尧 / deepchem

deepchem/data/tests/test_fasta_loader.py

+5 −3

Original line number	Diff line number	Diff line
		@@ -9,6 +9,7 @@ __license__ = "MIT"

		import os
		import unittest

		import deepchem as dc


		@@ -26,7 +27,8 @@ class TestFASTALoader(unittest.TestCase):
		"../../data/tests/example.fasta")
		loader = dc.data.FASTALoader()
		sequences = loader.featurize(input_file)

		# example.fasta contains 3 sequences each of length 58.
		# The one-hot encoding turns base-pairs into vectors of length 4.
		# There is one "image channel")
		assert sequences.X.shape == (3, 4, 58, 1)
		# The one-hot encoding turns base-pairs into vectors of length 5 (ATCGN).
		# There is one "image channel".
		assert sequences.X.shape == (3, 5, 58, 1)

deepchem/utils/save.py

+29 −7

Original line number	Diff line number	Diff line
		@@ -110,14 +110,15 @@ def load_csv_files(filenames, shard_size=None, verbose=True):
		def seq_one_hot_encode(sequences, letters='ATCGN'):
		"""One hot encodes list of genomic sequences.

		Sequences encoded have shape (N_sequences, 4, sequence_length, 1).
		Here 4 is for the 4 basepairs (ACGT) present in genomic sequences.
		Sequences encoded have shape (N_sequences, N_letters, sequence_length, 1).
		These sequences will be processed as images with one color channel.

		Parameters
		----------
		sequences: np.ndarray
		Array of genetic sequences
		letters: str
		String with the set of possible letters in the sequences.

		Raises
		------
		@@ -126,7 +127,7 @@ def seq_one_hot_encode(sequences, letters='ATCGN'):

		Returns
		-------
		np.ndarray: Shape (N_sequences, 4, sequence_length, 1).
		np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
		"""

		# The label encoder is given characters for ACGTN
		@@ -168,9 +169,11 @@ def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
		return b


		# This could just be ambiguous_dna_letters, but that would be much higher dim.
		class IUPACUnambiguousDNAWithN(Alphabet):
		letters = unambiguous_dna_letters + "N"
		"""
		A deepchem specific DNA Alphabet with the 4 main letters and N.
		"""
		letters = "ACGTN"


		def encode_fasta_sequence(fname):
		@@ -181,6 +184,10 @@ def encode_fasta_sequence(fname):
		----------
		fname: str
		Filename of fasta file.

		Returns
		-------
		np.ndarray: Shape (N_sequences, 5, sequence_length, 1).
		"""

		dna_alphabet = IUPACUnambiguousDNAWithN()
		@@ -188,12 +195,27 @@ def encode_fasta_sequence(fname):


		def encode_bio_sequence(fname, file_type="fasta", alphabet=None):
		# np.ndarray: Shape (N_sequences, 4, sequence_length, 1).
		"""
		Loads a sequence file and returns an array of one-hot sequences.

		Parameters
		----------
		fname: str
		Filename of the sequence file (e.g. a fasta or fastq file).
		file_type: str
		The file format to process, e.g. fasta or fastq. This value is
		passed through to Bio.SeqIO.parse.
		alphabet: Bio.Alphabet.Alphabet
		A Biopython Alphabet, which should have a `letters` class attribute.

		Returns
		-------
		np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
		"""

		if alphabet is None:
		alphabet = IUPACUnambiguousDNAWithN()

		# TODO: if None, then get from the filename
		sequences = SeqIO.parse(fname, file_type, alphabet)
		return seq_one_hot_encode(sequences, alphabet.letters)

deepchem/utils/test/data/example.fasta

+2 −2

Original line number	Diff line number	Diff line
		>seq0
		AC
		XY
		>seq1
		GA
		ZX

deepchem/utils/test/data/example.fastq

0 → 100644

+8 −0

Original line number	Diff line number	Diff line
		@seq0
		XY
		+
		hh
		@seq1
		ZX
		+
		hh

deepchem/utils/test/test_seq.py

+33 −5

Original line number	Diff line number	Diff line
		@@ -7,16 +7,28 @@ from __future__ import unicode_literals
		__author__ = "Bharath Ramsundar"
		__license__ = "MIT"

		import numpy as np
		import unittest
		import os

		import numpy as np
		from Bio.Alphabet import Alphabet

		import deepchem as dc


		class MockAlphabet(Alphabet):
		letters = "XYZ"


		class TestSeq(unittest.TestCase):
		"""
		Tests sequence handling utilities.
		"""

		def setUp(self):
		super(TestSeq, self).setUp()
		self.current_dir = os.path.dirname(os.path.abspath(__file__))

		def test_one_hot_simple(self):
		sequences = np.array(["ACGT", "GATA", "CGCG"])
		sequences = dc.utils.save.seq_one_hot_encode(sequences)
		@@ -31,13 +43,29 @@ class TestSeq(unittest.TestCase):
		sequences = dc.utils.save.seq_one_hot_encode(sequences)

		def test_encode_fasta_sequence(self):
		fname = "./data/example.fasta"
		# Test it's possible to load a sequence with an arbitrary alphabet from a fasta file.
		fname = os.path.join(self.current_dir, "./data/example.fasta")

		encoded_seqs = dc.utils.save.encode_bio_sequence(
		fname, alphabet=MockAlphabet())
		expected = np.expand_dims(
		np.array([
		[[1, 0], [0, 1], [0, 0]],
		[[0, 1], [0, 0], [1, 0]],
		]), -1)

		np.testing.assert_array_equal(expected, encoded_seqs)

		def test_encode_fastq_sequence(self):
		fname = os.path.join(self.current_dir, "./data/example.fastq")

		encoded_seqs = dc.utils.save.encode_bio_sequence(
		fname, file_type="fastq", alphabet=MockAlphabet())

		encoded_seqs = dc.utils.save.encode_fasta_sequence(fname)
		expected = np.expand_dims(
		np.array([
		[[0, 0], [1, 0], [0, 0], [0, 1], [0, 0]],
		[[1, 0], [0, 1], [0, 0], [0, 0], [0, 0]],
		[[1, 0], [0, 1], [0, 0]],
		[[0, 1], [0, 0], [1, 0]],
		]), -1)

		np.testing.assert_array_equal(expected, encoded_seqs)

Admin message