Commit 238935e0 authored by Trent Hauck's avatar Trent Hauck
Browse files

Path 1

parent 1b171da6
Loading
Loading
Loading
Loading
+41 −1
Original line number Diff line number Diff line
@@ -15,9 +15,11 @@ import pickle
import pandas as pd
import numpy as np
import os
import sys
import deepchem
from rdkit import Chem
from Bio.Data.IUPACData import unambiguous_dna_letters
from Bio.Alphabet import Alphabet
from Bio import SeqIO


def log(string, verbose=True):
@@ -178,6 +180,44 @@ def encode_fasta_sequence(fname):

  return seq_one_hot_encode(np.array(sequences))

# The full ambiguous_dna_letters alphabet would also work here, but it would
# make the one-hot encoding much higher-dimensional.
class IUPACUnambiguousDNAWithN(Alphabet):
    """Alphabet holding the unambiguous DNA letters followed by "N"."""

    letters = "".join((unambiguous_dna_letters, "N"))

def encode_sequence_with_biopython(fname, file_type="fasta", alphabet=None):
    """One-hot encode every sequence of a file parsed with Biopython.

    Parameters
    ----------
    fname: str
        File to read; passed straight to ``SeqIO.parse``.
    file_type: str, optional
        Format name passed to ``SeqIO.parse``. Defaults to "fasta".
    alphabet: Alphabet, optional
        Alphabet whose ``letters`` define the rows of the encoding (row ``i``
        corresponds to ``alphabet.letters[i]``). Defaults to
        ``IUPACUnambiguousDNAWithN()``.

    Returns
    -------
    np.ndarray
        Shape (N_sequences, len(alphabet.letters), sequence_length), where
        sequence_length is the length of the first sequence in the file.
        No trailing channel axis is added.

    Raises
    ------
    ValueError
        If the file yields no sequences at all.
    """
    if alphabet is None:
        alphabet = IUPACUnambiguousDNAWithN()

    # TODO: if None, then get from the filename
    sequences = SeqIO.parse(fname, file_type, alphabet)

    # Map each letter to its row index, in the order given by the alphabet.
    letter_encoder = {l: i for i, l in enumerate(alphabet.letters)}
    alphabet_length = len(letter_encoder)

    # Peek at the first record: its length fixes the width of every encoding.
    # A default is passed to next() so an empty file does not leak a bare
    # StopIteration to the caller.
    first_seq = next(sequences, None)
    if first_seq is None:
        raise ValueError(
            "No sequences found in %s (file_type=%r)" % (fname, file_type))
    sequence_length = len(first_seq.seq)

    seqs = [
        _seq_to_encoded(first_seq, letter_encoder, alphabet_length,
                        sequence_length)
    ]
    # The parser is a one-shot iterator, so this picks up after the first
    # record that was already consumed above.
    seqs.extend(
        _seq_to_encoded(other_seq, letter_encoder, alphabet_length,
                        sequence_length) for other_seq in sequences)

    return np.array(seqs)

def _seq_to_encoded(seq, letter_encoder, alphabet_length, sequence_length):
    b = np.zeros((alphabet_length, sequence_length))
    seq_ints = [letter_encoder[s] for s in seq]
    b[seq_ints, np.arange(sequence_length)] = 1

    return b


def save_metadata(tasks, metadata_df, data_dir):
  """
+4 −0
Original line number Diff line number Diff line
>seq0
AC
>seq1
GA
+11 −0
Original line number Diff line number Diff line
@@ -32,3 +32,14 @@ class TestSeq(unittest.TestCase):
    except ValueError:
      thrown = True
    assert thrown

  def test_encode_sequence_with_biopython(self):
    """Compare the encoding of the example FASTA file to a fixed array."""
    # One 2-D block per sequence; each inner pair is one row of that block.
    expected = np.array([
        [[0, 0], [1, 0], [0, 0], [0, 1], [0, 0]],
        [[1, 0], [0, 1], [0, 0], [0, 0], [0, 0]],
    ])

    encoded_seqs = dc.utils.save.encode_sequence_with_biopython(
        "./data/example.fasta")

    np.testing.assert_array_equal(expected, encoded_seqs)
+1 −0
Original line number Diff line number Diff line
@@ -47,4 +47,5 @@ conda install -y -q -c conda-forge jupyter=1.0.0
conda install -y -q -c conda-forge pbr=3.1.1
conda install -y -q -c rdkit rdkit=2017.09.1
conda install -y -q -c conda-forge setuptools=39.0.1
conda install -y -q -c conda-forge biopython=1.71
yes | pip install $tensorflow==1.6.0