Commit b07e478e authored by Haowen Zhang's avatar Haowen Zhang Committed by Li Song
Browse files

Make more alignment related functions free.

parent ffc9191f
Loading
Loading
Loading
Loading
+109 −0
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@

#include <smmintrin.h>

#include "sam_mapping.h"
#include "sequence_batch.h"

namespace chromap {
@@ -25,6 +26,114 @@ int GetLongestMatchLength(const char *pattern, const char *text,
  return max_match;
}

// return: newly adjust reference start/end position (kPositive for start,
// kNegative for end)
int AdjustGapBeginning(Direction mapping_direction, const char *ref,
                       const char *read, int *gap_beginning, int read_end,
                       int ref_start_position, int ref_end_position,
                       int *n_cigar, uint32_t **cigar) {
  int i, j;
  if (mapping_direction == kPositive) {
    if (*gap_beginning <= 0) {
      return ref_start_position;
    }
    // printf("%d\n", *gap_beginning);
    for (i = *gap_beginning - 1, j = ref_start_position - 1; i >= 0 && j >= 0;
         --i, --j) {
      // printf("%c %c\n", read[i], ref[j]);
      if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
        break;
      }
    }
    *gap_beginning = i + 1;
    // TODO: add soft clip in cigar
    if (n_cigar && *n_cigar > 0) {
      if (((*cigar)[0] & 0xf) == BAM_CMATCH) {
        (*cigar)[0] += (ref_start_position - 1 - j) << 4;
      }
    }
    return j + 1;
  } else {
    if (*gap_beginning <= 0) {
      return ref_end_position;
    }
    // printf("%d\n", *gap_beginning);
    /*char *tmp = new char[255] ;
    strncpy(tmp, ref + ref_start_position, ref_end_position - ref_start_position
    + 1 + 10) ; printf("%s %d. %d %d\n", tmp, strlen(tmp), ref_end_position -
    ref_start_position + 1 + 10, strlen(ref)) ; delete[] tmp;*/
    for (i = read_end + 1, j = ref_end_position + 1; read[i] && ref[j];
         ++i, ++j) {
      // printf("%c %c %c %c %c %c\n", read[i], ref[j - 1], ref[j], ref[j + 1],
      // ref[j + 2], ref[j + 3]);
      if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
        break;
      }
    }
    *gap_beginning = *gap_beginning + i - (read_end + 1);
    if (n_cigar && *n_cigar > 0) {
      if (((*cigar)[*n_cigar - 1] & 0xf) == BAM_CMATCH) {
        (*cigar)[*n_cigar - 1] += (j - (ref_end_position + 1)) << 4;
      }
    }

    return j - 1;
  }
}

void GenerateMDTag(const char *pattern, const char *text,
                   int mapping_start_position, int n_cigar,
                   const uint32_t *cigar, int &NM, std::string &MD_tag) {
  int num_matches = 0;
  const char *read = text;
  const char *reference = pattern + mapping_start_position;
  int read_position = 0;
  int reference_position = 0;
  for (int ci = 0; ci < n_cigar; ++ci) {
    uint32_t current_cigar_uint = cigar[ci];
    uint8_t cigar_operation = bam_cigar_op(current_cigar_uint);
    int num_cigar_operations = bam_cigar_oplen(current_cigar_uint);
    if (cigar_operation == BAM_CMATCH) {
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        if (reference[reference_position] == read[read_position] ||
            reference[reference_position] - 'a' + 'A' == read[read_position]) {
          // a match
          ++num_matches;
        } else {
          // a mismatch
          ++NM;
          if (num_matches != 0) {
            MD_tag.append(std::to_string(num_matches));
            num_matches = 0;
          }
          MD_tag.push_back(reference[reference_position]);
        }
        ++reference_position;
        ++read_position;
      }
    } else if (cigar_operation == BAM_CINS) {
      NM += num_cigar_operations;
      read_position += num_cigar_operations;
    } else if (cigar_operation == BAM_CDEL) {
      NM += num_cigar_operations;
      if (num_matches != 0) {
        MD_tag.append(std::to_string(num_matches));
        num_matches = 0;
      }
      MD_tag.push_back('^');
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        MD_tag.push_back(reference[reference_position]);
        ++reference_position;
      }
    } else {
      std::cerr << "Unexpected cigar op: " << (int)cigar_operation << "\n";
    }
  }
  if (num_matches != 0) {
    MD_tag.append(std::to_string(num_matches));
  }
}

int BandedAlignPatternToText(int error_threshold, const char *pattern,
                             const char *text, const int read_length,
                             int *mapping_end_position) {
+0 −112
Original line number Diff line number Diff line
@@ -2551,118 +2551,6 @@ void Chromap<SAMMapping>::EmplaceBackMappingRecord(
      std::string(read), std::string(read_qual)));
}

template <typename MappingRecord>
void Chromap<MappingRecord>::GenerateMDTag(const char *pattern,
                                           const char *text,
                                           int mapping_start_position,
                                           int n_cigar, const uint32_t *cigar,
                                           int &NM, std::string &MD_tag) {
  int num_matches = 0;
  const char *read = text;
  const char *reference = pattern + mapping_start_position;
  int read_position = 0;
  int reference_position = 0;
  for (int ci = 0; ci < n_cigar; ++ci) {
    uint32_t current_cigar_uint = cigar[ci];
    uint8_t cigar_operation = bam_cigar_op(current_cigar_uint);
    int num_cigar_operations = bam_cigar_oplen(current_cigar_uint);
    if (cigar_operation == BAM_CMATCH) {
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        if (reference[reference_position] == read[read_position] ||
            reference[reference_position] - 'a' + 'A' == read[read_position]) {
          // a match
          ++num_matches;
        } else {
          // a mismatch
          ++NM;
          if (num_matches != 0) {
            MD_tag.append(std::to_string(num_matches));
            num_matches = 0;
          }
          MD_tag.push_back(reference[reference_position]);
        }
        ++reference_position;
        ++read_position;
      }
    } else if (cigar_operation == BAM_CINS) {
      NM += num_cigar_operations;
      read_position += num_cigar_operations;
    } else if (cigar_operation == BAM_CDEL) {
      NM += num_cigar_operations;
      if (num_matches != 0) {
        MD_tag.append(std::to_string(num_matches));
        num_matches = 0;
      }
      MD_tag.push_back('^');
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        MD_tag.push_back(reference[reference_position]);
        ++reference_position;
      }
    } else {
      std::cerr << "Unexpected cigar op: " << (int)cigar_operation << "\n";
    }
  }
  if (num_matches != 0) {
    MD_tag.append(std::to_string(num_matches));
  }
}

// return: newly adjust reference start/end position (kPositive for start,
// kNegative for end)
template <typename MappingRecord>
int Chromap<MappingRecord>::AdjustGapBeginning(
    Direction mapping_direction, const char *ref, const char *read,
    int *gap_beginning, int read_end, int ref_start_position,
    int ref_end_position, int *n_cigar, uint32_t **cigar) {
  int i, j;
  if (mapping_direction == kPositive) {
    if (*gap_beginning <= 0) {
      return ref_start_position;
    }
    // printf("%d\n", *gap_beginning);
    for (i = *gap_beginning - 1, j = ref_start_position - 1; i >= 0 && j >= 0;
         --i, --j) {
      // printf("%c %c\n", read[i], ref[j]);
      if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
        break;
      }
    }
    *gap_beginning = i + 1;
    // TODO: add soft clip in cigar
    if (n_cigar && *n_cigar > 0) {
      if (((*cigar)[0] & 0xf) == BAM_CMATCH) {
        (*cigar)[0] += (ref_start_position - 1 - j) << 4;
      }
    }
    return j + 1;
  } else {
    if (*gap_beginning <= 0) {
      return ref_end_position;
    }
    // printf("%d\n", *gap_beginning);
    /*char *tmp = new char[255] ;
    strncpy(tmp, ref + ref_start_position, ref_end_position - ref_start_position
    + 1 + 10) ; printf("%s %d. %d %d\n", tmp, strlen(tmp), ref_end_position -
    ref_start_position + 1 + 10, strlen(ref)) ; delete[] tmp;*/
    for (i = read_end + 1, j = ref_end_position + 1; read[i] && ref[j];
         ++i, ++j) {
      // printf("%c %c %c %c %c %c\n", read[i], ref[j - 1], ref[j], ref[j + 1],
      // ref[j + 2], ref[j + 3]);
      if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
        break;
      }
    }
    *gap_beginning = *gap_beginning + i - (read_end + 1);
    if (n_cigar && *n_cigar > 0) {
      if (((*cigar)[*n_cigar - 1] & 0xf) == BAM_CMATCH) {
        (*cigar)[*n_cigar - 1] += (j - (ref_end_position + 1)) << 4;
      }
    }

    return j - 1;
  }
}

// The returned coordinate is left closed and right closed, and is without chrom
// id.
template <typename MappingRecord>
+0 −9
Original line number Diff line number Diff line
@@ -301,10 +301,6 @@ class Chromap {
                        const SequenceBatch &reference,
                        MappingMetadata &mapping_metadata);

  void GenerateMDTag(const char *pattern, const char *text,
                     int mapping_start_position, int n_cigar,
                     const uint32_t *cigar, int &NM, std::string &MD_tag);

  uint32_t MoveMappingsInBuffersToMappingContainer(
      uint32_t num_reference_sequences,
      std::vector<std::vector<std::vector<MappingRecord> > >
@@ -360,11 +356,6 @@ class Chromap {
                      const SequenceBatch &reference,
                      const std::vector<std::vector<MappingRecord> > &mappings);

  int AdjustGapBeginning(Direction mapping_direction, const char *ref,
                         const char *read, int *gap_beginning, int read_end,
                         int ref_start_position, int ref_end_position,
                         int *n_cigar, uint32_t **cigar);

  void GetRefStartEndPositionForReadFromMapping(
      Direction mapping_direction, const std::pair<int, uint64_t> &mapping,
      const char *read, int read_length, int in_split_site,