Unverified Commit aaed65f7 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2565 from alat-rights/FASTALoaderNew

Update FASTA Loader to accept arbitrary featurizers
parents a882ad8f 72f1c866
Loading
Loading
Loading
Loading
+115 −8
Original line number Diff line number Diff line
@@ -14,9 +14,10 @@ import numpy as np

from deepchem.utils.typing import OneOrMany
from deepchem.utils.data_utils import load_image_files, load_csv_files, load_json_files, load_sdf_files
from deepchem.utils.genomics_utils import encode_bio_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.utils.genomics_utils import encode_bio_sequence

logger = logging.getLogger(__name__)

@@ -875,9 +876,70 @@ class FASTALoader(DataLoader):
  learning tasks.
  """

  def __init__(self):
    """Initialize loader."""
    pass
  def __init__(self,
               featurizer: Optional[Featurizer] = None,
               auto_add_annotations: bool = False,
               legacy: bool = True):
    """Initialize FASTALoader.

    Parameters
    ----------
    featurizer: Featurizer (default: None)
      The Featurizer to be used for the loaded FASTA data.

      If featurizer is None and legacy is True, the original featurization
      logic is used, creating a one hot encoding of all included FASTA strings
      of shape
      (number of FASTA sequences, number of channels + 1, sequence length, 1).

      If featurizer is None and legacy is False, the featurizer is initialized
      as a OneHotFeaturizer object with charset ("A", "C", "T", "G") and
      max_length = None.

    auto_add_annotations: bool (default False)
      Whether create_dataset will automatically add [CLS] and [SEP] annotations
      to the sequences it reads in order to assist tokenization.
      Keep False if your FASTA file already includes [CLS] and [SEP] annotations.

    legacy: bool (default True)
      Whether to use legacy logic for featurization. Legacy mode will create
      a one hot encoding of the FASTA content of shape
      (number of FASTA sequences, number of channels + 1, max length, 1).

      Legacy mode is only tested for ACTGN charsets, and will be deprecated.

    Raises
    ------
    ValueError
      If legacy is True while featurizer is not None or
      auto_add_annotations is True (legacy mode supports neither).
    """
    # Process legacy toggle
    if legacy:
      # Fail fast: validate the incompatible-options case before emitting the
      # deprecation warning, so invalid construction does not also warn.
      if featurizer is not None or auto_add_annotations:
        raise ValueError(f"""
                          featurizer option must be None and
                          auto_add_annotations must be false when legacy mode
                          is enabled. You set featurizer to {featurizer} and
                          auto_add_annotations to {auto_add_annotations}.
                          """)
      warnings.warn(
          """
                    Legacy mode is deprecated and will be removed in
                    DeepChem 3.0. Disable legacy mode by passing legacy=False
                    during construction of FASTALoader object.
                    """, FutureWarning)

    # Set attributes
    self.legacy = legacy
    self.auto_add_annotations = auto_add_annotations

    # Populated only when a UserDefinedFeaturizer is supplied (DataLoader
    # convention); otherwise stays None.
    self.user_specified_features = None

    # Handle special featurizer cases
    if isinstance(featurizer, UserDefinedFeaturizer):  # User defined featurizer
      self.user_specified_features = featurizer.feature_fields
    elif featurizer is None:  # Default featurizer
      # Default: one-hot encode over the DNA charset with no length padding.
      featurizer = OneHotFeaturizer(
          charset=["A", "C", "T", "G"], max_length=None)

    # Set self.featurizer
    self.featurizer = featurizer

  def create_dataset(self,
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and doesn't allow for sharding.

    Parameters
    ----------
    input_files: List[str]
      List of FASTA files, or a single FASTA file path.
    data_dir: str, optional (default None)
      Directory where the featurized DiskDataset is stored.
    shard_size: int, optional (default None)
      Currently ignored; each input file becomes one shard.

    Returns
    -------
    DiskDataset
      A `DiskDataset` object containing a featurized representation
      of the data from `input_files`.
    """
    if isinstance(input_files, str):
      input_files = [input_files]

    def _add_sequence(sequences: np.ndarray,
                      sequence: np.ndarray) -> np.ndarray:
      """Join `sequence` fragments into one string and append to `sequences`."""
      # Handle empty sequence (e.g. two consecutive header lines).
      if sequence is None or len(sequence) <= 0:
        # BUGFIX: previously returned np.array([]), discarding every
        # previously accumulated sequence. Keep what we already have.
        # TODO log attempts to add empty sequences every shard
        return sequences
      # Annotate start/stop of sequence.
      # BUGFIX: honor the loader's auto_add_annotations flag; the old inner
      # parameter defaulted to False and was never passed, so the option
      # silently had no effect.
      if self.auto_add_annotations:
        sequence = np.insert(sequence, 0, "[CLS]")
        sequence = np.append(sequence, "[SEP]")
      return np.append(sequences, ''.join(sequence))

    def _read_file(input_file: str) -> np.ndarray:
      """Convert a FASTA file to a numpy array of FASTA-format strings."""

      # TODO don't convert all sequences into np array (allow shards)
      def _generate_sequences(fasta_file, header_mark=">") -> np.ndarray:
        """Accumulate annotated FASTA-format strings from an open file."""
        sequences = np.array([])
        sequence = np.array([])
        header_read = False
        for line in fasta_file:
          if line.startswith(header_mark):  # New header line
            header_read = True
            sequences = _add_sequence(sequences, sequence)
            sequence = np.array([])
          elif header_read:  # Line contains sequence in FASTA format
            # Strip the trailing newline before appending the fragment.
            sequence = np.append(sequence, line.rstrip('\n'))
        return _add_sequence(sequences, sequence)  # Add last sequence

      with open(input_file, 'r') as f:  # Read FASTA file
        return _generate_sequences(f)

    def shard_generator():  # TODO Enable sharding with shard size parameter
      for input_file in input_files:
        if self.legacy:
          X = encode_bio_sequence(input_file)
        else:
          X = self.featurizer(_read_file(input_file))
        # NOTE(review): ids are all ones, so they are not unique per sample —
        # confirm downstream DiskDataset consumers tolerate duplicate ids.
        ids = np.ones(len(X))
        # (X, y, w, ids)
        yield X, None, None, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir)


+29 −2
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ import os
import unittest

import deepchem as dc
from deepchem.feat.molecule_featurizers import OneHotFeaturizer


class TestFASTALoader(unittest.TestCase):
@@ -16,13 +17,39 @@ class TestFASTALoader(unittest.TestCase):
    super(TestFASTALoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def test_legacy_fasta_one_hot(self):
    """Legacy mode one-hot encodes FASTA into (N, channels, length, 1)."""
    fasta_path = os.path.join(self.current_dir,
                              "../../data/tests/example.fasta")
    dataset = dc.data.FASTALoader(legacy=True).create_dataset(fasta_path)

    # example.fasta holds 3 sequences, each of length 58. The legacy
    # encoding maps every base pair to a length-5 vector (ATCGN) and adds
    # a single "image channel" axis.
    assert dataset.X.shape == (3, 5, 58, 1)

  def test_fasta_one_hot(self):
    """Non-legacy mode one-hot encodes FASTA into (N, length, charset+1)."""
    fasta_path = os.path.join(self.current_dir,
                              "../../data/tests/example.fasta")
    dataset = dc.data.FASTALoader(legacy=False).create_dataset(fasta_path)

    # After the FASTALoader redesign the channel axis is gone and the
    # one-hot dimension (ACTG + unknown = 5) comes last.
    assert dataset.X.shape == (3, 58, 5)

  def test_fasta_one_hot_big(self):
    """One-hot featurize a protein FASTA file with a large custom charset."""
    protein = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '*', '-'
    ]
    input_file = os.path.join(self.current_dir,
                              "../../data/tests/uniprot_truncated.fasta")
    loader = dc.data.FASTALoader(
        OneHotFeaturizer(charset=protein, max_length=1000), legacy=False)
    sequences = loader.create_dataset(input_file)

    # BUGFIX: `assert sequences.X.shape` was vacuous — a non-empty tuple is
    # always truthy. Pin the real contract instead: at least one sequence,
    # each padded to max_length=1000 and encoded over 28 charset symbols
    # plus the unknown-character slot (29).
    assert len(sequences.X) > 0
    assert sequences.X.shape[1:] == (1000, 29)

  # TODO: test with full uniprot file once sharding support is added.
+354 −0

File added.

Preview size limit exceeded, changes collapsed.

+27 −18
Original line number Diff line number Diff line
@@ -6,7 +6,7 @@ import numpy as np
from deepchem.utils.typing import RDKitMol
from deepchem.utils.molecule_feature_utils import one_hot_encode
from deepchem.feat.base_classes import Featurizer
from typing import Any, Iterable
from typing import Any, Iterable, Optional

logger = logging.getLogger(__name__)

@@ -32,21 +32,30 @@ class OneHotFeaturizer(Featurizer):
  It does not need RDKit to be installed to work with arbitrary strings.
  """

  def __init__(self,
               charset: List[str] = ZINC_CHARSET,
               max_length: Optional[int] = 100):
    """Initialize featurizer.

    Parameters
    ----------
    charset: List[str] (default ZINC_CHARSET)
      A list of strings, where each string is length 1 and unique.
    max_length: Optional[int], optional (default 100)
      The max length for string. If the length of string is shorter than
      max_length, the string is padded using space.

      If max_length is None, no padding is performed and arbitrary length
      strings are allowed.

    Raises
    ------
    ValueError
      If `charset` contains duplicate entries.
    """
    if len(charset) != len(set(charset)):
      raise ValueError("All values in charset must be unique.")
    self.charset = charset
    # BUGFIX: the previous code first assigned the typing construct
    # `Optional[int]` itself to self.max_length before overwriting it — a
    # confusing no-op. Normalize in one step: int when given, else None.
    self.max_length = int(max_length) if max_length is not None else None

  def featurize(self, datapoints: Iterable[Any],
                log_every_n: int = 1000) -> np.ndarray:
    """Featurize strings or mols.

    Parameters
    ----------
    datapoints: list
      A list of either strings (str or numpy.str_) or RDKit molecules.
    log_every_n: int, optional (default 1000)
      How many elements are featurized every time a featurization is logged.
    """
    # Materialize the iterable once so it can be length-checked and reused.
    datapoints = list(datapoints)
    if not datapoints:
      return np.array([])
    # Delegate the actual work to featurize() in the parent class.
    return Featurizer.featurize(self, datapoints, log_every_n)

  def _featurize(self, datapoint: Any):
    # Featurize str data
    if (type(datapoint) == str):
    if isinstance(datapoint, (str, np.str_)):
      return self._featurize_string(datapoint)
    # Featurize mol data
    else:
@@ -88,14 +97,11 @@ class OneHotFeaturizer(Featurizer):
      The shape is `(max_length, len(charset) + 1)`.
      The index of unknown character is `len(charset)`.
    """
    # validation
    if (len(string) > self.max_length):
      logger.info(
          "The length of {} is longer than `max_length`. So we return an empty array."
      )
      return np.array([])

    if isinstance(self.max_length, int):
      if (len(string) > self.max_length):  # Validation
        raise ValueError("The length of {} is longer than `max_length`.")
      string = self.pad_string(string)  # Padding

    return np.array([
        one_hot_encode(val, self.charset, include_unknown_set=True)
        for val in string
@@ -151,7 +157,10 @@ class OneHotFeaturizer(Featurizer):
    str
      String space padded to self.pad_length
    """
    if isinstance(self.max_length, int):
      return string.ljust(self.max_length)
    else:
      return string

  def untransform(self, one_hot_vectors: np.ndarray) -> str:
    """Convert from one hot representation back to original string