Unverified Commit 8f329dff authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1020 from rbharath/fasta

Implementation of FASTALoader
parents 0e2cb8d9 0875273d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,4 +18,5 @@ from deepchem.data.data_loader import DataLoader
from deepchem.data.data_loader import CSVLoader
from deepchem.data.data_loader import UserCSVLoader
from deepchem.data.data_loader import SDFLoader
from deepchem.data.data_loader import FASTALoader
import deepchem.data.tests
+47 −1
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ import sys
from deepchem.utils.save import log
from deepchem.utils.save import load_csv_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.save import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset

@@ -186,7 +187,20 @@ class DataLoader(object):
    self.log_every_n = log_every_n

  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    """Featurize provided files and write to specified location.
    
    For large datasets, automatically shards into smaller chunks
    for convenience.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.
    """
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

@@ -280,3 +294,35 @@ class SDFLoader(DataLoader):
    log("Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    return featurize_mol_df(shard, self.featurizer, field=self.mol_field)


class FASTALoader(DataLoader):
  """
  Loader that featurizes FASTA files into a DiskDataset.

  Each input file is one-hot encoded with `encode_fasta_sequence` and
  becomes one shard of the resulting dataset.
  """

  def __init__(self, verbose=True):
    """Create the loader.

    Parameters
    ----------
    verbose: bool
      (Optional) Verbosity flag stored on the loader.
    """
    self.verbose = verbose

  def featurize(self, input_files, data_dir=None):
    """One-hot encode FASTA files and store them as a DiskDataset.

    Parameters
    ----------
    input_files: list
      List of fasta files. A single non-list value is treated as a
      one-element list.
    data_dir: str
      (Optional) Name of directory where featurized data is stored.

    Returns
    -------
    The result of `DiskDataset.create_dataset` for the encoded shards.
    """
    if isinstance(input_files, list):
      fasta_files = input_files
    else:
      fasta_files = [input_files]

    # Both generators are lazy: a file is only read and encoded when
    # the corresponding shard is requested.
    encoded_files = (encode_fasta_sequence(path) for path in fasta_files)
    # Shard layout is (X, y, w, ids); FASTA files carry no labels or
    # weights, so y and w are None.
    # NOTE(review): every id is 1.0, so ids do not distinguish
    # individual sequences -- confirm this is intended.
    shards = ((X, None, None, np.ones(len(X))) for X in encoded_files)

    return DiskDataset.create_dataset(shards, data_dir)
+6 −0
Original line number Diff line number Diff line
>seq0
ACGTCCCACACGATGCATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
>seq1
GTCGATGCATGCTAGCTAGCTAGCTAGCTACGATCGATCGATCGTACGATCGATCGAT
>seq2
ACACATCATCATTACTATATATTATATATCGATCGATCGATCGATCGTACGTAGCTAG
+33 −0
Original line number Diff line number Diff line
"""
Tests that FASTA files can be loaded.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__license__ = "MIT"

import os
import unittest
import deepchem as dc


class TestFASTALoader(unittest.TestCase):
  """
  Checks that FASTALoader featurizes a FASTA file.
  """

  def setUp(self):
    super(TestFASTALoader, self).setUp()
    # Directory containing this test module; the fixture path below is
    # resolved relative to it.
    this_file = os.path.abspath(__file__)
    self.current_dir = os.path.dirname(this_file)

  def test_fasta_load(self):
    # example.fasta contains 3 sequences, each of length 58. One-hot
    # encoding turns every base into a vector of length 4, and the
    # trailing axis is the single "image channel".
    expected_shape = (3, 4, 58, 1)
    fasta_path = os.path.join(self.current_dir,
                              "../../data/tests/example.fasta")
    dataset = dc.data.FASTALoader().featurize(fasta_path)
    assert dataset.X.shape == expected_shape
+75 −0
Original line number Diff line number Diff line
@@ -8,12 +8,14 @@ from __future__ import unicode_literals
# TODO(rbharath): Use standard joblib once old-data has been regenerated.
import joblib
from sklearn.externals import joblib as old_joblib
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import gzip
import json
import pickle
import pandas as pd
import numpy as np
import os
import sys
import deepchem
from rdkit import Chem

@@ -104,6 +106,79 @@ def load_csv_files(filenames, shard_size=None, verbose=True):
        yield df


def seq_one_hot_encode(sequences):
  """One hot encodes list of genomic sequences.

  Sequences encoded have shape (N_sequences, 4, sequence_length, 1).
  Here 4 is for the 4 bases present in genomic sequences; along that
  axis the channels are ordered A, C, G, T. The unknown base N is
  accepted and encoded as an all-zero column. The trailing axis of
  size 1 lets the result be processed as an image with one color
  channel.

  The encoding is done with plain NumPy rather than scikit-learn's
  `LabelEncoder`/`OneHotEncoder`, so it does not depend on encoder
  keyword arguments that differ between scikit-learn releases.

  Parameters
  ----------
  sequences: np.ndarray
    Array (or list) of genetic sequences given as upper-case strings
    over the alphabet ACGTN.

  Raises
  ------
  ValueError:
    If no sequences are given, if sequences are of different lengths,
    or if a sequence contains a character other than A, C, G, T or N.

  Returns
  -------
  np.ndarray: Shape (N_sequences, 4, sequence_length, 1), dtype int32.
  """
  n_sequences = len(sequences)
  if n_sequences == 0:
    raise ValueError("At least one sequence is required")

  sequence_length = len(sequences[0])
  for sequence in sequences:
    if len(sequence) != sequence_length:
      raise ValueError("All sequences must be of same length")

  # Lookup table from ASCII code to channel index. -1 marks characters
  # outside the alphabet. N gets index 4, which matches none of the
  # four output channels and therefore encodes as all zeros.
  channel_of = np.full(256, -1, dtype=np.int8)
  for channel, letter in enumerate("ACGTN"):
    channel_of[ord(letter)] = channel

  try:
    raw = "".join(sequences).encode("ascii")
  except UnicodeEncodeError:
    raise ValueError("Sequences may only contain the characters ACGTN")

  # One byte per base, laid out as (N_sequences, sequence_length).
  codes = np.frombuffer(raw, dtype=np.uint8).reshape(n_sequences,
                                                     sequence_length)
  channels = channel_of[codes]
  if (channels < 0).any():
    raise ValueError("Sequences may only contain the characters ACGTN")

  # Broadcast (N, 1, L) against (1, 4, 1) to get the (N, 4, L) mask.
  one_hot = channels[:, np.newaxis, :] == np.arange(4).reshape(1, 4, 1)
  return one_hot.astype(np.int32)[:, :, :, np.newaxis]


def encode_fasta_sequence(fname):
  """
  Reads a fasta file and one-hot encodes the sequences it contains.

  A line starting with ">" opens a new record. The lines that follow,
  up to the next ">" line or end of file, are concatenated and
  upper-cased to form that record's sequence. Lines that appear before
  the first ">" line are discarded.

  Parameters
  ----------
  fname: str
    Filename of fasta file.

  Returns
  -------
  The result of `seq_one_hot_encode` applied to the array of sequences.
  """
  records = []
  header = None
  chunks = []
  with open(fname) as handle:
    for raw_line in handle:
      text = raw_line.rstrip()
      if not text.startswith(">"):
        chunks.append(text)
        continue
      # A new header closes the record that was being accumulated.
      if header:
        records.append(''.join(chunks).upper())
      header = text
      chunks = []
  # Flush the final record, which has no following header to close it.
  if header is not None:
    records.append(''.join(chunks).upper())

  return seq_one_hot_encode(np.array(records))


def save_metadata(tasks, metadata_df, data_dir):
  """
  Saves the metadata for a DiskDataset
Loading