Commit f67fb30f authored by Trent Hauck's avatar Trent Hauck
Browse files

Add more tests.

parent 5509c30d
Loading
Loading
Loading
Loading
+5 −3
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ __license__ = "MIT"

import os
import unittest

import deepchem as dc


@@ -26,7 +27,8 @@ class TestFASTALoader(unittest.TestCase):
                              "../../data/tests/example.fasta")
    loader = dc.data.FASTALoader()
    sequences = loader.featurize(input_file)

    # example.fasta contains 3 sequences each of length 58.
    # The one-hot encoding turns base-pairs into vectors of length 4.
    # There is one "image channel")
    assert sequences.X.shape == (3, 4, 58, 1)
    # The one-hot encoding turns base-pairs into vectors of length 5 (ATCGN).
    # There is one "image channel".
    assert sequences.X.shape == (3, 5, 58, 1)
+29 −7
Original line number Diff line number Diff line
@@ -110,14 +110,15 @@ def load_csv_files(filenames, shard_size=None, verbose=True):
def seq_one_hot_encode(sequences, letters='ATCGN'):
  """One hot encodes list of genomic sequences.

  Sequences encoded have shape (N_sequences, 4, sequence_length, 1).
  Here 4 is for the 4 basepairs (ACGT) present in genomic sequences.
  Sequences encoded have shape (N_sequences, N_letters, sequence_length, 1).
  These sequences will be processed as images with one color channel.

  Parameters
  ----------
  sequences: np.ndarray
    Array of genetic sequences
  letters: str
    String with the set of possible letters in the sequences.

  Raises
  ------
@@ -126,7 +127,7 @@ def seq_one_hot_encode(sequences, letters='ATCGN'):

  Returns
  -------
  np.ndarray: Shape (N_sequences, 4, sequence_length, 1).
  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
  """

  # The label encoder is given characters for ACGTN
@@ -168,9 +169,11 @@ def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
  return b


# This could just be ambiguous_dna_letters, but that would be much higher dim.
class IUPACUnambiguousDNAWithN(Alphabet):
  letters = unambiguous_dna_letters + "N"
  """
  A DeepChem-specific DNA alphabet with the four unambiguous letters (ACGT) plus N.
  """
  letters = "ACGTN"


def encode_fasta_sequence(fname):
@@ -181,6 +184,10 @@ def encode_fasta_sequence(fname):
  ----------
  fname: str
    Filename of fasta file.

  Returns
  -------
  np.ndarray: Shape (N_sequences, 5, sequence_length, 1).
  """

  dna_alphabet = IUPACUnambiguousDNAWithN()
@@ -188,12 +195,27 @@ def encode_fasta_sequence(fname):


def encode_bio_sequence(fname, file_type="fasta", alphabet=None):
  """
  Parses a sequence file and one-hot encodes every sequence in it.

  Parameters
  ----------
  fname: str
    Path of the sequence file to read.
  file_type: str
    Format name handed to Biopython's SeqIO.parse, e.g. "fasta" or "fastq".
  alphabet: Bio.Alphabet.Alphabet
    A Biopython Alphabet exposing a `letters` class attribute. When None,
    IUPACUnambiguousDNAWithN is used.

  Returns
  -------
  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
  """
  # TODO: if None, then get from the filename
  chosen = IUPACUnambiguousDNAWithN() if alphabet is None else alphabet

  # The same alphabet drives both the parsing and the one-hot letter set.
  records = SeqIO.parse(fname, file_type, chosen)
  return seq_one_hot_encode(records, chosen.letters)

+2 −2
Original line number Diff line number Diff line
>seq0
AC
XY
>seq1
GA
ZX
+8 −0
Original line number Diff line number Diff line
@seq0
XY
+
hh
@seq1
ZX
+
hh
+33 −5
Original line number Diff line number Diff line
@@ -7,16 +7,28 @@ from __future__ import unicode_literals
__author__ = "Bharath Ramsundar"
__license__ = "MIT"

import numpy as np
import unittest
import os

import numpy as np
from Bio.Alphabet import Alphabet

import deepchem as dc


class MockAlphabet(Alphabet):
  """Three-letter stand-in alphabet for testing encoding with a non-DNA alphabet."""
  letters = "XYZ"


class TestSeq(unittest.TestCase):
  """
  Tests sequence handling utilities.
  """

  def setUp(self):
    super(TestSeq, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def test_one_hot_simple(self):
    sequences = np.array(["ACGT", "GATA", "CGCG"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences)
@@ -31,13 +43,29 @@ class TestSeq(unittest.TestCase):
      sequences = dc.utils.save.seq_one_hot_encode(sequences)

  def test_encode_fasta_sequence(self):
    fname = "./data/example.fasta"
    # Test it's possible to load a sequence with an arbitrary alphabet from a fasta file.
    fname = os.path.join(self.current_dir, "./data/example.fasta")

    encoded_seqs = dc.utils.save.encode_bio_sequence(
        fname, alphabet=MockAlphabet())
    expected = np.expand_dims(
        np.array([
            [[1, 0], [0, 1], [0, 0]],
            [[0, 1], [0, 0], [1, 0]],
        ]), -1)

    np.testing.assert_array_equal(expected, encoded_seqs)

  def test_encode_fastq_sequence(self):
    fname = os.path.join(self.current_dir, "./data/example.fastq")

    encoded_seqs = dc.utils.save.encode_bio_sequence(
        fname, file_type="fastq", alphabet=MockAlphabet())

    encoded_seqs = dc.utils.save.encode_fasta_sequence(fname)
    expected = np.expand_dims(
        np.array([
            [[0, 0], [1, 0], [0, 0], [0, 1], [0, 0]],
            [[1, 0], [0, 1], [0, 0], [0, 0], [0, 0]],
            [[1, 0], [0, 1], [0, 0]],
            [[0, 1], [0, 0], [1, 0]],
        ]), -1)

    np.testing.assert_array_equal(expected, encoded_seqs)