Commit 78967af2 authored by Li's avatar Li Committed by Li Song
Browse files

Preliminary implementation of adding paired-end information in SAM output.

parent 29bc02d0
Loading
Loading
Loading
Loading
+21 −6
Original line number Original line Diff line number Diff line
@@ -48,6 +48,7 @@ void MappingGenerator<SAMMapping>::EmplaceBackSingleEndMappingRecord(
      mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),
      mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),
      mapping_in_memory.barcode_key, /*num_dups=*/1,
      mapping_in_memory.barcode_key, /*num_dups=*/1,
      mapping_in_memory.GetFragmentStartPosition(), mapping_in_memory.rid,
      mapping_in_memory.GetFragmentStartPosition(), mapping_in_memory.rid,
      /*mpos=*/0, /*mrid=*/-1, 0, 
      mapping_in_memory.SAM_flag, mapping_in_memory.GetStrand(),
      mapping_in_memory.SAM_flag, mapping_in_memory.GetStrand(),
      /*is_alt=*/0, mapping_in_memory.is_unique, mapping_in_memory.mapq,
      /*is_alt=*/0, mapping_in_memory.is_unique, mapping_in_memory.mapq,
      mapping_in_memory.NM, mapping_in_memory.n_cigar, mapping_in_memory.cigar,
      mapping_in_memory.NM, mapping_in_memory.n_cigar, mapping_in_memory.cigar,
@@ -84,12 +85,26 @@ template <>
void MappingGenerator<SAMMapping>::EmplaceBackPairedEndMappingRecord(
void MappingGenerator<SAMMapping>::EmplaceBackPairedEndMappingRecord(
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    PairedEndMappingInMemory &paired_end_mapping_in_memory,
    std::vector<std::vector<SAMMapping>> &mappings_on_diff_ref_seqs) {
    std::vector<std::vector<SAMMapping>> &mappings_on_diff_ref_seqs) {
  EmplaceBackSingleEndMappingRecord(
  int tlen = (int)paired_end_mapping_in_memory.GetFragmentLength();
      paired_end_mapping_in_memory.mapping_in_memory1,
  for (int i = 0; i < 2; ++i) {
      mappings_on_diff_ref_seqs);
    MappingInMemory &mapping_in_memory = (i == 0 ? paired_end_mapping_in_memory.mapping_in_memory1 :
  EmplaceBackSingleEndMappingRecord(
        paired_end_mapping_in_memory.mapping_in_memory2);
      paired_end_mapping_in_memory.mapping_in_memory2,
    MappingInMemory &mate_mapping_in_memory = (i == 0 ? paired_end_mapping_in_memory.mapping_in_memory2 :
      mappings_on_diff_ref_seqs);
        paired_end_mapping_in_memory.mapping_in_memory1);
  
    mappings_on_diff_ref_seqs[mapping_in_memory.rid].emplace_back(
      mapping_in_memory.read_id, std::string(mapping_in_memory.read_name),
      mapping_in_memory.barcode_key, /*num_dups=*/1,
      mapping_in_memory.GetFragmentStartPosition(), mapping_in_memory.rid,
      /*mpos=*/mate_mapping_in_memory.GetFragmentStartPosition(), 
      /*mrid=*/mate_mapping_in_memory.rid, 
      /*tlen=*/mapping_in_memory.GetStrand() ? tlen : -tlen, 
      mapping_in_memory.SAM_flag, mapping_in_memory.GetStrand(),
      /*is_alt=*/0, mapping_in_memory.is_unique, mapping_in_memory.mapq,
      mapping_in_memory.NM, mapping_in_memory.n_cigar, mapping_in_memory.cigar,
      mapping_in_memory.MD_tag, std::string(mapping_in_memory.read_sequence),
      std::string(mapping_in_memory.qual_sequence));
  }
}
}


template <>
template <>
+7 −1
Original line number Original line Diff line number Diff line
@@ -329,13 +329,19 @@ void MappingWriter<SAMMapping>::AppendMapping(uint32_t rid,
  // mapping.fragment_length;
  // mapping.fragment_length;
  const char *reference_sequence_name =
  const char *reference_sequence_name =
      (mapping.flag_ & BAM_FUNMAP) > 0 ? "*" : reference.GetSequenceNameAt(rid);
      (mapping.flag_ & BAM_FUNMAP) > 0 ? "*" : reference.GetSequenceNameAt(rid);
  const char *mate_ref_sequence_name =
      mapping.mrid_ < 0 ? "*" : 
      ((uint32_t)mapping.mrid_ == rid ? "=" : reference.GetSequenceNameAt(mapping.mrid_));
  const uint32_t mapping_start_position = mapping.GetStartPosition();
  const uint32_t mapping_start_position = mapping.GetStartPosition();
  const uint32_t mate_mapping_start_position = mapping.mrid_ < 0 ? 0 : (mapping.mpos_ + 1);
  this->AppendMappingOutput(
  this->AppendMappingOutput(
      mapping.read_name_ + "\t" + std::to_string(mapping.flag_) + "\t" +
      mapping.read_name_ + "\t" + std::to_string(mapping.flag_) + "\t" +
      std::string(reference_sequence_name) + "\t" +
      std::string(reference_sequence_name) + "\t" +
      std::to_string(mapping_start_position) + "\t" +
      std::to_string(mapping_start_position) + "\t" +
      std::to_string(mapping.mapq_) + "\t" + mapping.GenerateCigarString() +
      std::to_string(mapping.mapq_) + "\t" + mapping.GenerateCigarString() +
      "\t*\t" + std::to_string(0) + "\t" + std::to_string(0) + "\t" +
      "\t" + std::string(mate_ref_sequence_name) + "\t" + 
      std::to_string(mate_mapping_start_position) + "\t" + 
      std::to_string(mapping.tlen_) + "\t" +
      mapping.sequence_ + "\t" + mapping.sequence_qual_ + "\t" +
      mapping.sequence_ + "\t" + mapping.sequence_qual_ + "\t" +
      mapping.GenerateIntTagString("NM", mapping.NM_) +
      mapping.GenerateIntTagString("NM", mapping.NM_) +
      "\tMD:Z:" + mapping.MD_);
      "\tMD:Z:" + mapping.MD_);
+21 −4
Original line number Original line Diff line number Diff line
@@ -135,6 +135,9 @@ class SAMMapping : public Mapping {


  int64_t pos_;  // forward strand 5'-end mapping position (inclusive)
  int64_t pos_;  // forward strand 5'-end mapping position (inclusive)
  int rid_;      // reference sequence index in bntseq_t; <0 for unmapped
  int rid_;      // reference sequence index in bntseq_t; <0 for unmapped
  int64_t mpos_;  // forward strand 5'-end mapping position for mate (inclusive)
  int mrid_;      // reference sequence index in bntseq_t; <0 for unmapped
  int tlen_;      // template length
  int flag_;     // extra flag
  int flag_;     // extra flag
  uint32_t is_rev_ : 1, is_alt_ : 1, is_unique_ : 1, mapq_ : 7,
  uint32_t is_rev_ : 1, is_alt_ : 1, is_unique_ : 1, mapq_ : 7,
      NM_ : 22;      // is_rev: whether on the reverse strand; mapq: mapping
      NM_ : 22;      // is_rev: whether on the reverse strand; mapq: mapping
@@ -152,6 +155,7 @@ class SAMMapping : public Mapping {


  SAMMapping(uint32_t read_id, const std::string &read_name,
  SAMMapping(uint32_t read_id, const std::string &read_name,
             uint64_t cell_barcode, uint8_t num_dups, int64_t pos, int rid,
             uint64_t cell_barcode, uint8_t num_dups, int64_t pos, int rid,
             int64_t mpos, int mrid, int tlen, 
             int flag, uint8_t is_rev, uint8_t is_alt, uint8_t is_unique,
             int flag, uint8_t is_rev, uint8_t is_alt, uint8_t is_unique,
             uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar,
             uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar,
             const std::string &MD_tag, const std::string &sequence,
             const std::string &MD_tag, const std::string &sequence,
@@ -162,6 +166,9 @@ class SAMMapping : public Mapping {
        num_dups_(num_dups),
        num_dups_(num_dups),
        pos_(pos),
        pos_(pos),
        rid_(rid),
        rid_(rid),
        mpos_(mpos),
        mrid_(mrid),
        tlen_(tlen),
        flag_(flag),
        flag_(flag),
        is_rev_(is_rev),
        is_rev_(is_rev),
        is_alt_(is_alt),
        is_alt_(is_alt),
@@ -194,13 +201,13 @@ class SAMMapping : public Mapping {
  bool operator<(const SAMMapping &m) const {
  bool operator<(const SAMMapping &m) const {
    int read1_flag = flag_ & BAM_FREAD1;
    int read1_flag = flag_ & BAM_FREAD1;
    int m_read1_flag = m.flag_ & BAM_FREAD1;
    int m_read1_flag = m.flag_ & BAM_FREAD1;
    return std::tie(rid_, pos_, cell_barcode_, mapq_, read_id_, read1_flag) <
    return std::tie(rid_, pos_, cell_barcode_, mapq_, mrid_, mpos_, read_id_, read1_flag) <
           std::tie(m.rid_, m.pos_, m.cell_barcode_, m.mapq_, m.read_id_,
           std::tie(m.rid_, m.pos_, m.cell_barcode_, m.mapq_, m.mrid_, m.mpos_, m.read_id_,
                    m_read1_flag);
                    m_read1_flag);
  }
  }
  bool operator==(const SAMMapping &m) const {
  bool operator==(const SAMMapping &m) const {
    return std::tie(pos_, rid_, cell_barcode_, is_rev_) ==
    return std::tie(pos_, rid_, cell_barcode_, is_rev_, mrid_, mpos_) ==
           std::tie(m.pos_, m.rid_, m.cell_barcode_, m.is_rev_);
           std::tie(m.pos_, m.rid_, m.cell_barcode_, m.is_rev_, mrid_, mpos_);
  }
  }
  bool IsSamePosition(const SAMMapping &m) const {
  bool IsSamePosition(const SAMMapping &m) const {
    return std::tie(pos_, rid_, is_rev_) == std::tie(m.pos_, m.rid_, m.is_rev_);
    return std::tie(pos_, rid_, is_rev_) == std::tie(m.pos_, m.rid_, m.is_rev_);
@@ -300,6 +307,12 @@ class SAMMapping : public Mapping {
        fwrite(&rid_, sizeof(int), 1, temp_mapping_output_file);
        fwrite(&rid_, sizeof(int), 1, temp_mapping_output_file);
    num_written_bytes +=
    num_written_bytes +=
        fwrite(&flag_, sizeof(int), 1, temp_mapping_output_file);
        fwrite(&flag_, sizeof(int), 1, temp_mapping_output_file);
    num_written_bytes +=
        fwrite(&mpos_, sizeof(int64_t), 1, temp_mapping_output_file);
    num_written_bytes +=
        fwrite(&mrid_, sizeof(int), 1, temp_mapping_output_file);
    num_written_bytes +=
        fwrite(&tlen_, sizeof(int), 1, temp_mapping_output_file);
    uint32_t rev_alt_unique_mapq_NM = (is_rev_ << 31) | (is_alt_ << 30) |
    uint32_t rev_alt_unique_mapq_NM = (is_rev_ << 31) | (is_alt_ << 30) |
                                      (is_unique_ << 29) | (mapq_ << 22) | NM_;
                                      (is_unique_ << 29) | (mapq_ << 22) | NM_;
    num_written_bytes += fwrite(&rev_alt_unique_mapq_NM, sizeof(uint32_t), 1,
    num_written_bytes += fwrite(&rev_alt_unique_mapq_NM, sizeof(uint32_t), 1,
@@ -344,6 +357,10 @@ class SAMMapping : public Mapping {
    num_read_bytes +=
    num_read_bytes +=
        fread(&pos_, sizeof(int64_t), 1, temp_mapping_output_file);
        fread(&pos_, sizeof(int64_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&rid_, sizeof(int), 1, temp_mapping_output_file);
    num_read_bytes += fread(&rid_, sizeof(int), 1, temp_mapping_output_file);
    num_read_bytes +=
        fread(&mpos_, sizeof(int64_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&mrid_, sizeof(int), 1, temp_mapping_output_file);
    num_read_bytes += fread(&tlen_, sizeof(int), 1, temp_mapping_output_file);
    num_read_bytes += fread(&flag_, sizeof(int), 1, temp_mapping_output_file);
    num_read_bytes += fread(&flag_, sizeof(int), 1, temp_mapping_output_file);
    uint32_t rev_alt_unique_mapq_NM = 0;
    uint32_t rev_alt_unique_mapq_NM = 0;
    num_read_bytes += fread(&rev_alt_unique_mapq_NM, sizeof(uint32_t), 1,
    num_read_bytes += fread(&rev_alt_unique_mapq_NM, sizeof(uint32_t), 1,