Unverified Commit 29bcf0fb authored by Karl Leswing's avatar Karl Leswing Committed by GitHub
Browse files

Merge pull request #1420 from vsag96/Fix-1318

WIP:Move genome Utility functions to genomics.py
parents 41bbe499 97cdc811
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ import sys
from deepchem.utils.save import log
from deepchem.utils.save import load_csv_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.save import encode_fasta_sequence
from deepchem.utils.genomics import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset
from deepchem.data import NumpyDataset
+4 −4
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ class TestGenomicMetrics(unittest.TestCase):
    # Encode motif
    motif_name = "TAL1_known4"
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    # sequences now has shape (3, 4, 5, 1)
    self.assertEqual(sequences.shape, (3, 4, 5, 1))

@@ -38,7 +38,7 @@ class TestGenomicMetrics(unittest.TestCase):
    """Test get_pssm_scores returns correct shape."""
    motif_name = "TAL1_known4"
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    # sequences now has shape (3, 4, 5, 1)
    self.assertEqual(sequences.shape, (3, 4, 5, 1))
    pssm = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
@@ -50,7 +50,7 @@ class TestGenomicMetrics(unittest.TestCase):
    """Test in-silico mutagenesis returns correct shape."""
    # Construct and train SequenceDNN model
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    labels = np.array([1, 0, 0])
    labels = np.reshape(labels, (3, 1))
    self.assertEqual(sequences.shape, (3, 4, 5, 1))
@@ -71,7 +71,7 @@ class TestGenomicMetrics(unittest.TestCase):
    """Test in-silico mutagenesis returns nonzero output."""
    # Construct and train SequenceDNN model
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    sequences = dc.utils.genomics.seq_one_hot_encode(sequences, letters=LETTERS)
    labels = np.array([1, 0, 0])
    labels = np.reshape(labels, (3, 1))
    self.assertEqual(sequences.shape, (3, 4, 5, 1))
+105 −0
Original line number Diff line number Diff line
@@ -3,6 +3,111 @@ Genomic data handling utilities.
"""
import simdna
from simdna.synthetic import LoadedEncodeMotifs
import numpy as np

loaded_motifs = LoadedEncodeMotifs(
    simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)


def seq_one_hot_encode(sequences, letters='ATCGN'):
  """One hot encodes list of genomic sequences.

  Sequences encoded have shape (N_sequences, N_letters, sequence_length, 1).
  These sequences will be processed as images with one color channel.

  Parameters
  ----------
  sequences: np.ndarray
    Array of genetic sequences
  letters: str
    String with the set of possible letters in the sequences.

  Raises
  ------
  ValueError:
    If sequences are of different lengths.

  Returns
  -------
  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
  """

  # The label encoder is given characters for ACGTN
  letter_encoder = {l: i for i, l in enumerate(letters)}
  alphabet_length = len(letter_encoder)

  # Peak at the first sequence to get the length of the sequence.
  try:
    first_seq = next(sequences)
    tail_seq = sequences
  except TypeError:
    first_seq = sequences[0]
    tail_seq = sequences[1:]

  sequence_length = len(first_seq)

  seqs = []

  seqs.append(
      _seq_to_encoded(first_seq, letter_encoder, alphabet_length,
                      sequence_length))

  for other_seq in tail_seq:
    if len(other_seq) != sequence_length:
      raise ValueError

    seqs.append(
        _seq_to_encoded(other_seq, letter_encoder, alphabet_length,
                        sequence_length))

  return np.expand_dims(np.array(seqs), -1)


def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
  b = np.zeros((alphabet_length, sequence_length))
  seq_ints = [letter_encoder[s] for s in seq]
  b[seq_ints, np.arange(sequence_length)] = 1

  return b


def encode_fasta_sequence(fname):
  """
  Loads fasta file and returns an array of one-hot sequences.

  Parameters
  ----------
  fname: str
    Filename of fasta file.

  Returns
  -------
  np.ndarray: Shape (N_sequences, 5, sequence_length, 1).
  """

  return encode_bio_sequence(fname)


def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
  """
  Loads a sequence file and returns an array of one-hot sequences.

  Parameters
  ----------
  fname: str
    Filename of fasta file.
  file_type: str
    The type of file encoding to process, e.g. fasta or fastq, this
    is passed to Biopython.SeqIO.parse.
  letters: str
    The set of letters that the sequences consist of, e.g. ATCG.

  Returns
  -------
  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
  """

  from Bio import SeqIO

  sequences = SeqIO.parse(fname, file_type)
  return seq_one_hot_encode(sequences, letters)
+14 −44
Original line number Diff line number Diff line
@@ -16,6 +16,8 @@ import numpy as np
import os
import deepchem
from rdkit import Chem
import warnings
from deepchem.utils.genomics import encode_bio_sequence as encode_sequence, encode_fasta_sequence as fasta_sequence, seq_one_hot_encode as seq_one_hotencode


def log(string, verbose=True):
@@ -126,44 +128,10 @@ def seq_one_hot_encode(sequences, letters='ATCGN'):
  -------
  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
  """

  # The label encoder is given characters for ACGTN
  letter_encoder = {l: i for i, l in enumerate(letters)}
  alphabet_length = len(letter_encoder)

  # Peak at the first sequence to get the length of the sequence.
  try:
    first_seq = next(sequences)
    tail_seq = sequences
  except TypeError:
    first_seq = sequences[0]
    tail_seq = sequences[1:]

  sequence_length = len(first_seq)

  seqs = []

  seqs.append(
      _seq_to_encoded(first_seq, letter_encoder, alphabet_length,
                      sequence_length))

  for other_seq in tail_seq:
    if len(other_seq) != sequence_length:
      raise ValueError

    seqs.append(
        _seq_to_encoded(other_seq, letter_encoder, alphabet_length,
                        sequence_length))

  return np.expand_dims(np.array(seqs), -1)


def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
  b = np.zeros((alphabet_length, sequence_length))
  seq_ints = [letter_encoder[s] for s in seq]
  b[seq_ints, np.arange(sequence_length)] = 1

  return b
  warnings.warn(
      "This Function has been deprecated and now resides in deepchem.utils.genomics ",
      DeprecationWarning)
  return seq_one_hotencode(sequences, letters=letters)


def encode_fasta_sequence(fname):
@@ -179,8 +147,11 @@ def encode_fasta_sequence(fname):
  -------
  np.ndarray: Shape (N_sequences, 5, sequence_length, 1).
  """
  warnings.warn(
      "This Function has been deprecated and now resides in deepchem.utils.genomics",
      DeprecationWarning)

  return encode_bio_sequence(fname)
  return fasta_sequence(fname)


def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
@@ -201,11 +172,10 @@ def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):
  -------
  np.ndarray: Shape (N_sequences, N_letters, sequence_length, 1).
  """

  from Bio import SeqIO

  sequences = SeqIO.parse(fname, file_type)
  return seq_one_hot_encode(sequences, letters)
  warnings.warn(
      "This Function has been deprecated and now resides in deepchem.utils.genomics ",
      DeprecationWarning)
  return encode_sequence(fname, file_type=file_type, letters=letters)


def save_metadata(tasks, metadata_df, data_dir):