Commit 9b64a21c authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Introduce an in memory mapping layout.

Use this struct to hold all kinds of detailed mappings in memory. This
also reduce the nubmer of parameters for calling many related functions.

This change also help identify and fix a bug on single-end read SAM
mappings.
parent 0193ab5a
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@ CXX=g++
CXXFLAGS=-std=c++11 -Wall -O3 -fopenmp -msse4.1
LDFLAGS=-lm -lz

cpp_source=sequence_batch.cc index.cc candidate_processor.cc alignment.cc feature_barcode_matrix.cc ksw.cc mapping_writer.cc chromap.cc chromap_driver.cc
cpp_source=sequence_batch.cc index.cc candidate_processor.cc alignment.cc feature_barcode_matrix.cc ksw.cc mapping_generator.cc mapping_writer.cc chromap.cc chromap_driver.cc
src_dir=src
objs_dir=objs
objs+=$(patsubst %.cc,$(objs_dir)/%.o,$(cpp_source))
+51 −38
Original line number Diff line number Diff line
@@ -22,8 +22,6 @@ int GetLongestMatchLength(const char *pattern, const char *text,
  return max_match;
}

// return: newly adjust reference start/end position (kPositive for start,
// kNegative for end)
int AdjustGapBeginning(Direction mapping_direction, const char *ref,
                       const char *read, int *gap_beginning, int read_end,
                       int ref_start_position, int ref_end_position,
@@ -33,7 +31,9 @@ int AdjustGapBeginning(Direction mapping_direction, const char *ref,
    if (*gap_beginning <= 0) {
      return ref_start_position;
    }

    // printf("%d\n", *gap_beginning);

    for (i = *gap_beginning - 1, j = ref_start_position - 1; i >= 0 && j >= 0;
         --i, --j) {
      // printf("%c %c\n", read[i], ref[j]);
@@ -41,6 +41,7 @@ int AdjustGapBeginning(Direction mapping_direction, const char *ref,
        break;
      }
    }

    *gap_beginning = i + 1;
    // TODO: add soft clip in cigar
    if (n_cigar && *n_cigar > 0) {
@@ -48,16 +49,20 @@ int AdjustGapBeginning(Direction mapping_direction, const char *ref,
        (*cigar)[0] += (ref_start_position - 1 - j) << 4;
      }
    }

    return j + 1;
  } else {
  }

  if (*gap_beginning <= 0) {
    return ref_end_position;
  }

  // printf("%d\n", *gap_beginning);
  /*char *tmp = new char[255] ;
  strncpy(tmp, ref + ref_start_position, ref_end_position - ref_start_position
  + 1 + 10) ; printf("%s %d. %d %d\n", tmp, strlen(tmp), ref_end_position -
  ref_start_position + 1 + 10, strlen(ref)) ; delete[] tmp;*/

  for (i = read_end + 1, j = ref_end_position + 1; read[i] && ref[j];
       ++i, ++j) {
    // printf("%c %c %c %c %c %c\n", read[i], ref[j - 1], ref[j], ref[j + 1],
@@ -66,7 +71,9 @@ int AdjustGapBeginning(Direction mapping_direction, const char *ref,
      break;
    }
  }

  *gap_beginning = *gap_beginning + i - (read_end + 1);

  if (n_cigar && *n_cigar > 0) {
    if (((*cigar)[*n_cigar - 1] & 0xf) == BAM_CMATCH) {
      (*cigar)[*n_cigar - 1] += (j - (ref_end_position + 1)) << 4;
@@ -75,16 +82,22 @@ int AdjustGapBeginning(Direction mapping_direction, const char *ref,

  return j - 1;
}
}

void GenerateMDTag(const char *pattern, const char *text,
                   int mapping_start_position, int n_cigar,
                   const uint32_t *cigar, int &NM, std::string &MD_tag) {
  int num_matches = 0;
void GenerateNMAndMDTag(const char *pattern, const char *text,
                        int mapping_start_position,
                        MappingInMemory &mapping_in_memory) {
  const char *read = text;
  const char *reference = pattern + mapping_start_position;

  const uint32_t *cigar = mapping_in_memory.cigar;
  const int n_cigar = mapping_in_memory.n_cigar;
  mapping_in_memory.NM = 0;
  mapping_in_memory.MD_tag.clear();

  int num_matches = 0;
  int read_position = 0;
  int reference_position = 0;

  for (int ci = 0; ci < n_cigar; ++ci) {
    uint32_t current_cigar_uint = cigar[ci];
    uint8_t cigar_operation = bam_cigar_op(current_cigar_uint);
@@ -97,28 +110,28 @@ void GenerateMDTag(const char *pattern, const char *text,
          ++num_matches;
        } else {
          // a mismatch
          ++NM;
          ++mapping_in_memory.NM;
          if (num_matches != 0) {
            MD_tag.append(std::to_string(num_matches));
            mapping_in_memory.MD_tag.append(std::to_string(num_matches));
            num_matches = 0;
          }
          MD_tag.push_back(reference[reference_position]);
          mapping_in_memory.MD_tag.push_back(reference[reference_position]);
        }
        ++reference_position;
        ++read_position;
      }
    } else if (cigar_operation == BAM_CINS) {
      NM += num_cigar_operations;
      mapping_in_memory.NM += num_cigar_operations;
      read_position += num_cigar_operations;
    } else if (cigar_operation == BAM_CDEL) {
      NM += num_cigar_operations;
      mapping_in_memory.NM += num_cigar_operations;
      if (num_matches != 0) {
        MD_tag.append(std::to_string(num_matches));
        mapping_in_memory.MD_tag.append(std::to_string(num_matches));
        num_matches = 0;
      }
      MD_tag.push_back('^');
      mapping_in_memory.MD_tag.push_back('^');
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        MD_tag.push_back(reference[reference_position]);
        mapping_in_memory.MD_tag.push_back(reference[reference_position]);
        ++reference_position;
      }
    } else {
@@ -126,7 +139,7 @@ void GenerateMDTag(const char *pattern, const char *text,
    }
  }
  if (num_matches != 0) {
    MD_tag.append(std::to_string(num_matches));
    mapping_in_memory.MD_tag.append(std::to_string(num_matches));
  }
}

+8 −5
Original line number Diff line number Diff line
#ifndef ALIGNMENT_H_
#define ALIGNMENT_H_

#include "mapping_in_memory.h"
#include "sam_mapping.h"
#include "sequence_batch.h"
#include "utils.h"
@@ -10,16 +11,18 @@ namespace chromap {
int GetLongestMatchLength(const char *pattern, const char *text,
                          const int read_length);

// return: newly adjust reference start/end position (kPositive for start,
// kNegative for end)
// Return newly adjusted reference start/end position for kPositive/kNegative
// mappings.
int AdjustGapBeginning(Direction mapping_direction, const char *ref,
                       const char *read, int *gap_beginning, int read_end,
                       int ref_start_position, int ref_end_position,
                       int *n_cigar, uint32_t **cigar);

void GenerateMDTag(const char *pattern, const char *text,
                   int mapping_start_position, int n_cigar,
                   const uint32_t *cigar, int &NM, std::string &MD_tag);
// Reference (pattern) mapping start postion and cigar must be computed before
// calling this function. Read (text) must be already at the start position.
void GenerateNMAndMDTag(const char *pattern, const char *text,
                        int mapping_start_position,
                        MappingInMemory &mapping_in_memory);

int BandedAlignPatternToText(int error_threshold, const char *pattern,
                             const char *text, const int read_length,
+218 −0
Original line number Diff line number Diff line
#include "mapping_generator.h"

namespace chromap {

// For direction, kPositive is 1, kNegative is 0;
template <>
void MappingGenerator<MappingWithoutBarcode>::EmplaceBackSingleEndMappingRecord(
    MappingInMemory &mapping_in_memory,
    std::vector<std::vector<MappingWithoutBarcode>>
        &mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(
      mapping_in_memory.read_id, mapping_in_memory.GetFragmentStartPosition(),
      mapping_in_memory.GetFragmentLength(), mapping_in_memory.mapq,
      mapping_in_memory.GetDirection(), mapping_in_memory.is_unique,
      /*num_dups=*/1);
}

template <>
void MappingGenerator<MappingWithBarcode>::EmplaceBackSingleEndMappingRecord(
    MappingInMemory &mapping_in_memory,
    std::vector<std::vector<MappingWithBarcode>> &mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(
      mapping_in_memory.read_id, mapping_in_memory.barcode_key,
      mapping_in_memory.GetFragmentStartPosition(),
      mapping_in_memory.GetFragmentLength(), mapping_in_memory.mapq,
      mapping_in_memory.GetDirection(), mapping_in_memory.is_unique,
      /*num_dups=*/1);
}

template <>
void MappingGenerator<PAFMapping>::EmplaceBackSingleEndMappingRecord(
    MappingInMemory &mapping_in_memory,
    std::vector<std::vector<PAFMapping>> &mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(
      mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),
      mapping_in_memory.read_length,
      mapping_in_memory.GetFragmentStartPosition(),
      mapping_in_memory.GetFragmentLength(), mapping_in_memory.mapq,
      mapping_in_memory.GetDirection(), mapping_in_memory.is_unique,
      /*num_dups=*/1);
}

template <>
void MappingGenerator<SAMMapping>::EmplaceBackSingleEndMappingRecord(
    MappingInMemory &mapping_in_memory,
    std::vector<std::vector<SAMMapping>> &mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(
      mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),
      mapping_in_memory.barcode_key, /*num_dups=*/1,
      mapping_in_memory.GetFragmentStartPosition(), mapping_in_memory.rid,
      mapping_in_memory.SAM_flag, mapping_in_memory.GetDirection(),
      /*is_alt=*/0, mapping_in_memory.is_unique, mapping_in_memory.mapq,
      mapping_in_memory.NM, mapping_in_memory.n_cigar, mapping_in_memory.cigar,
      mapping_in_memory.MD_tag, std::string(mapping_in_memory.read_sequence),
      std::string(mapping_in_memory.qual_sequence));
}

template <>
void MappingGenerator<PairedEndMappingWithBarcode>::
    EmplaceBackSingleEndMappingRecord(
        MappingInMemory &mapping_in_memory,
        std::vector<std::vector<PairedEndMappingWithBarcode>>
            &mappings_on_diff_ref_seqs) = delete;

template <>
void MappingGenerator<PairedEndMappingWithoutBarcode>::
    EmplaceBackSingleEndMappingRecord(
        MappingInMemory &mapping_in_memory,
        std::vector<std::vector<PairedEndMappingWithoutBarcode>>
            &mappings_on_diff_ref_seqs) = delete;

template <>
void MappingGenerator<PairedPAFMapping>::EmplaceBackSingleEndMappingRecord(
    MappingInMemory &mapping_in_memory,
    std::vector<std::vector<PairedPAFMapping>> &mappings_on_diff_ref_seqs) =
    delete;

template <>
void MappingGenerator<PairsMapping>::EmplaceBackSingleEndMappingRecord(
    MappingInMemory &mapping_in_memory,
    std::vector<std::vector<PairsMapping>> &mappings_on_diff_ref_seqs) = delete;

template <>
void MappingGenerator<SAMMapping>::EmplaceBackPairedEndMappingRecord(
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    std::vector<std::vector<SAMMapping>> &mappings_on_diff_ref_seqs) {
  EmplaceBackSingleEndMappingRecord(
      paired_end_mapping_in_memory.mapping_in_memory1,
      mappings_on_diff_ref_seqs);
  EmplaceBackSingleEndMappingRecord(
      paired_end_mapping_in_memory.mapping_in_memory2,
      mappings_on_diff_ref_seqs);
}

template <>
void MappingGenerator<PairedEndMappingWithoutBarcode>::
    EmplaceBackPairedEndMappingRecord(
        PairedEndMappingInMemory &paired_end_mapping_in_memory,
        std::vector<std::vector<PairedEndMappingWithoutBarcode>>
            &mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs[paired_end_mapping_in_memory.mapping_in_memory1.rid]
      .emplace_back(paired_end_mapping_in_memory.GetReadId(),
                    paired_end_mapping_in_memory.GetFragmentStartPosition(),
                    paired_end_mapping_in_memory.GetFragmentLength(),
                    paired_end_mapping_in_memory.mapq,
                    paired_end_mapping_in_memory.GetDirection(),
                    paired_end_mapping_in_memory.is_unique, /*num_dups=*/1,
                    paired_end_mapping_in_memory.GetPositiveAlignmentLength(),
                    paired_end_mapping_in_memory.GetNegativeAlignmentLength());
}

template <>
void MappingGenerator<PairedEndMappingWithBarcode>::
    EmplaceBackPairedEndMappingRecord(
        PairedEndMappingInMemory &paired_end_mapping_in_memory,
        std::vector<std::vector<PairedEndMappingWithBarcode>>
            &mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs[paired_end_mapping_in_memory.mapping_in_memory1.rid]
      .emplace_back(paired_end_mapping_in_memory.GetReadId(),
                    paired_end_mapping_in_memory.GetBarcode(),
                    paired_end_mapping_in_memory.GetFragmentStartPosition(),
                    paired_end_mapping_in_memory.GetFragmentLength(),
                    paired_end_mapping_in_memory.mapq,
                    paired_end_mapping_in_memory.GetDirection(),
                    paired_end_mapping_in_memory.is_unique, /*num_dups=*/1,
                    paired_end_mapping_in_memory.GetPositiveAlignmentLength(),
                    paired_end_mapping_in_memory.GetNegativeAlignmentLength());
}

template <>
void MappingGenerator<PairedPAFMapping>::EmplaceBackPairedEndMappingRecord(
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    std::vector<std::vector<PairedPAFMapping>> &mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs[paired_end_mapping_in_memory.mapping_in_memory1.rid]
      .emplace_back(
          paired_end_mapping_in_memory.GetReadId(),
          std::string(
              paired_end_mapping_in_memory.mapping_in_memory1.read_name),
          std::string(
              paired_end_mapping_in_memory.mapping_in_memory2.read_name),
          paired_end_mapping_in_memory.mapping_in_memory1.read_length,
          paired_end_mapping_in_memory.mapping_in_memory2.read_length,
          paired_end_mapping_in_memory.GetFragmentStartPosition(),
          paired_end_mapping_in_memory.GetNegativeAlignmentLength(),
          paired_end_mapping_in_memory.GetFragmentLength(),
          paired_end_mapping_in_memory.GetPositiveAlignmentLength(),
          paired_end_mapping_in_memory.mapq,
          paired_end_mapping_in_memory.mapping_in_memory1.mapq,
          paired_end_mapping_in_memory.mapping_in_memory2.mapq,
          paired_end_mapping_in_memory.GetDirection(),
          paired_end_mapping_in_memory.is_unique, /*num_dups=*/1);
}

template <>
void MappingGenerator<PairsMapping>::EmplaceBackPairedEndMappingRecord(
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    std::vector<std::vector<PairsMapping>> &mappings_on_diff_ref_seqs) {
  uint8_t direction1 =
      paired_end_mapping_in_memory.mapping_in_memory1.GetDirection();
  uint8_t direction2 =
      paired_end_mapping_in_memory.mapping_in_memory2.GetDirection();

  int position1 =
      paired_end_mapping_in_memory.mapping_in_memory1.ref_start_position;
  int position2 =
      paired_end_mapping_in_memory.mapping_in_memory2.ref_start_position;

  if (paired_end_mapping_in_memory.mapping_in_memory1.direction == kNegative) {
    position1 =
        paired_end_mapping_in_memory.mapping_in_memory1.ref_end_position;
  }

  if (paired_end_mapping_in_memory.mapping_in_memory2.direction == kNegative) {
    position2 =
        paired_end_mapping_in_memory.mapping_in_memory2.ref_end_position;
  }

  int rid1 = paired_end_mapping_in_memory.mapping_in_memory1.rid;
  int rid2 = paired_end_mapping_in_memory.mapping_in_memory2.rid;
  const int rid1_rank = pairs_custom_rid_rank_[rid1];
  const int rid2_rank = pairs_custom_rid_rank_[rid2];

  const bool is_rid1_rank_smaller =
      rid1_rank < rid2_rank || (rid1 == rid2 && position1 < position2);
  if (!is_rid1_rank_smaller) {
    std::swap(rid1, rid2);
    std::swap(position1, position2);
    std::swap(direction1, direction2);
  }

  mappings_on_diff_ref_seqs[rid1].emplace_back(
      paired_end_mapping_in_memory.GetReadId(),
      std::string(paired_end_mapping_in_memory.mapping_in_memory1.read_name),
      paired_end_mapping_in_memory.GetBarcode(), rid1, rid2, position1,
      position2, direction1, direction2, paired_end_mapping_in_memory.mapq,
      paired_end_mapping_in_memory.is_unique, /*num_dups=*/1);
}

template <>
void MappingGenerator<MappingWithBarcode>::EmplaceBackPairedEndMappingRecord(
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    std::vector<std::vector<MappingWithBarcode>> &mappings_on_diff_ref_seqs) =
    delete;

template <>
void MappingGenerator<MappingWithoutBarcode>::EmplaceBackPairedEndMappingRecord(
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    std::vector<std::vector<MappingWithoutBarcode>>
        &mappings_on_diff_ref_seqs) = delete;

template <>
void MappingGenerator<PAFMapping>::EmplaceBackPairedEndMappingRecord(
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    std::vector<std::vector<PAFMapping>> &mappings_on_diff_ref_seqs) = delete;



}  // namespace chromap
+265 −510

File changed.

Preview size limit exceeded, changes collapsed.

Loading