Commit 9858315d authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Add MD tag for debug

parent b62de826
Loading
Loading
Loading
Loading
+62 −5
Original line number Original line Diff line number Diff line
@@ -1728,12 +1728,63 @@ void Chromap<PAFMapping>::EmplaceBackMappingRecord(uint32_t read_id, const char
}
}


template<typename MappingRecord>
template<typename MappingRecord>
void Chromap<MappingRecord>::EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::vector<MappingRecord> *mappings_on_diff_ref_seqs) {
void Chromap<MappingRecord>::EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::string &MD_tag, std::vector<MappingRecord> *mappings_on_diff_ref_seqs) {
}
}


template<>
template<>
void Chromap<SAMMapping>::EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::vector<SAMMapping> *mappings_on_diff_ref_seqs) {
void Chromap<SAMMapping>::EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::string &MD_tag, std::vector<SAMMapping> *mappings_on_diff_ref_seqs) {
  mappings_on_diff_ref_seqs->emplace_back(SAMMapping{read_id, std::string(read_name), num_dups, position, rid, flag, direction, 0, is_unique, mapq, NM, n_cigar, cigar});
  mappings_on_diff_ref_seqs->emplace_back(SAMMapping{read_id, std::string(read_name), num_dups, position, rid, flag, direction, 0, is_unique, mapq, NM, n_cigar, cigar, MD_tag});
}

template <typename MappingRecord>
void Chromap<MappingRecord>::GenerateMDTag(const char *pattern, const char *text, int mapping_start_position, int n_cigar, const uint32_t *cigar, int &NM, std::string &MD_tag) {
  int num_matches = 0;
  const char *read = text;
  const char *reference = pattern + mapping_start_position;
  int read_position = 0;
  int reference_position = 0;
  for (int ci = 0; ci < n_cigar; ++ci) {
    uint32_t current_cigar_uint = cigar[ci];
    uint8_t cigar_operation = bam_cigar_op(current_cigar_uint);
    int num_cigar_operations = bam_cigar_oplen(current_cigar_uint);
    if (cigar_operation == BAM_CMATCH) {
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        if (reference[reference_position] == read[read_position]) {
          // a match
          ++num_matches;
        } else {
          //a mismatch
          ++NM;
          if (num_matches != 0) {
            MD_tag.append(std::to_string(num_matches));
            num_matches = 0;
          }
          MD_tag.push_back(reference[reference_position]);
        }
        ++reference_position;
        ++read_position;
      }
    } else if (cigar_operation == BAM_CINS) {
      NM += num_cigar_operations;
      read_position += num_cigar_operations;
    } else if (cigar_operation == BAM_CDEL) {
      NM += num_cigar_operations;
      if (num_matches != 0) {
        MD_tag.append(std::to_string(num_matches));
        num_matches = 0;
      }
      MD_tag.push_back('^');
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        MD_tag.push_back(reference[reference_position]);
        ++reference_position;
      }
    } else {
      std::cerr << "Unexpected cigar op: " << (int)cigar_operation << "\n";
    }
  }
  if (num_matches != 0) {
    MD_tag.append(std::to_string(num_matches));
  }
}
}


template <typename MappingRecord>
template <typename MappingRecord>
@@ -1786,8 +1837,11 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
            uint32_t *cigar;
            uint32_t *cigar;
            int mapping_end_position;
            int mapping_end_position;
            ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position, read_length, read, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
            ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position, read_length, read, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
            int NM = 0;
            std::string MD_tag = "";
            GenerateMDTag(reference.GetSequenceAt(rid), read, verification_window_start_position + mapping_start_position + 1, n_cigar, cigar, NM, MD_tag);
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, mapping_end_position - mapping_start_position + 1, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, mapping_end_position - mapping_start_position + 1, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + mapping_start_position, rid, flag, 1, is_unique, mapq, min_num_errors, n_cigar, cigar, &((*mappings_on_diff_ref_seqs)[rid]));
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + mapping_start_position, rid, flag, 1, is_unique, mapq, NM, n_cigar, cigar, MD_tag, &((*mappings_on_diff_ref_seqs)[rid]));
          } else {
          } else {
            //int n_cigar = 0;
            //int n_cigar = 0;
            //uint32_t *cigar;
            //uint32_t *cigar;
@@ -1815,8 +1869,11 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
            uint32_t *cigar;
            uint32_t *cigar;
            int mapping_end_position;
            int mapping_end_position;
            ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position, read_length, negative_read.data() + read_start_site, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
            ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position, read_length, negative_read.data() + read_start_site, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
            int NM = 0;
            std::string MD_tag = "";
            GenerateMDTag(reference.GetSequenceAt(rid), negative_read.data() + read_start_site, verification_window_start_position + mapping_start_position + 1, n_cigar, cigar, NM, MD_tag);
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, mapping_end_position - mapping_start_position + 1, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, mapping_end_position - mapping_start_position + 1, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + mapping_end_position, rid, flag, 0, is_unique, mapq, min_num_errors, n_cigar, cigar, &((*mappings_on_diff_ref_seqs)[rid]));
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + mapping_end_position, rid, flag, 0, is_unique, mapq, NM, n_cigar, cigar, MD_tag, &((*mappings_on_diff_ref_seqs)[rid]));
          } else {
          } else {
            //int n_cigar = 0;
            //int n_cigar = 0;
            //uint32_t *cigar;
            //uint32_t *cigar;
+2 −1
Original line number Original line Diff line number Diff line
@@ -137,7 +137,7 @@ class Chromap {
  void ProcessBestMappingsForSingleEndRead(Direction mapping_direction, uint8_t mapq, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings, const std::vector<int> &split_sites, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void ProcessBestMappingsForSingleEndRead(Direction mapping_direction, uint8_t mapq, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings, const std::vector<int> &split_sites, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char* read_name, uint16_t read_length, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char* read_name, uint16_t read_length, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::string &MD_tag, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void ApplyTn5ShiftOnSingleEndMapping(uint32_t num_reference_sequences, std::vector<std::vector<MappingRecord> > *mappings);
  void ApplyTn5ShiftOnSingleEndMapping(uint32_t num_reference_sequences, std::vector<std::vector<MappingRecord> > *mappings);
  uint32_t LoadSingleEndReadsWithBarcodes(SequenceBatch *read_batch, SequenceBatch *barcode_batch);
  uint32_t LoadSingleEndReadsWithBarcodes(SequenceBatch *read_batch, SequenceBatch *barcode_batch);


@@ -155,6 +155,7 @@ class Chromap {
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, std::vector<int> *split_sites, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, std::vector<int> *split_sites, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<Candidate> &positive_candidates, const std::vector<Candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<int> *positive_split_sites, std::vector<std::pair<int, uint64_t> > *negative_mappings, std::vector<int> *negative_split_sites, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<Candidate> &positive_candidates, const std::vector<Candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<int> *positive_split_sites, std::vector<std::pair<int, uint64_t> > *negative_mappings, std::vector<int> *negative_split_sites, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void GenerateMDTag(const char *pattern, const char *text, int mapping_start_position, int n_cigar, const uint32_t *cigar, int &NM, std::string &MD_tag);
  void AllocateMultiMappings(uint32_t num_reference_sequences);
  void AllocateMultiMappings(uint32_t num_reference_sequences);
  void RemovePCRDuplicate(uint32_t num_reference_sequences);
  void RemovePCRDuplicate(uint32_t num_reference_sequences);
  uint32_t MoveMappingsInBuffersToMappingContainer(uint32_t num_reference_sequences, std::vector<std::vector<std::vector<MappingRecord> > > *mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
  uint32_t MoveMappingsInBuffersToMappingContainer(uint32_t num_reference_sequences, std::vector<std::vector<std::vector<MappingRecord> > > *mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
+3 −4
Original line number Original line Diff line number Diff line
@@ -588,7 +588,6 @@ int ksw_semi_global3(int qlen, const char *query, int tlen, const char *target,
		}
		}
		eh[end].h = h1; eh[end].e = MINUS_INF;
		eh[end].h = h1; eh[end].e = MINUS_INF;
	}
	}
	//score = eh[qlen].h;
	score = eh[qlen].h;
	score = eh[qlen].h;


  int max_score_position = qlen;
  int max_score_position = qlen;
@@ -610,10 +609,10 @@ int ksw_semi_global3(int qlen, const char *query, int tlen, const char *target,
			//which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
			//which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
			which = z[(long)i * n_col + (k - i)] >> (which<<1) & 3;
			which = z[(long)i * n_col + (k - i)] >> (which<<1) & 3;
			if (which == 0)      cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
			if (which == 0)      cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
			else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
			else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --i;
			else                 cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
			else                 cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --k;
		}
		}
		if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
		if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, i + 1);
    if (mapping_start_position) {
    if (mapping_start_position) {
      *mapping_start_position = k;
      *mapping_start_position = k;
    }
    }
+2 −1
Original line number Original line Diff line number Diff line
@@ -214,6 +214,7 @@ struct SAMMapping {
  uint32_t is_rev:1, is_alt:1, is_unique:1, mapq:7, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
  uint32_t is_rev:1, is_alt:1, is_unique:1, mapq:7, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
  int n_cigar;     // number of CIGAR operations
  int n_cigar;     // number of CIGAR operations
  uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
  uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
  std::string MD;
  char *XA;        // alternative mappings
  char *XA;        // alternative mappings


  int score, sub, alt_sc;
  int score, sub, alt_sc;
@@ -751,7 +752,7 @@ inline void SAMOutputTools<SAMMapping>::AppendMapping(uint32_t rid, const Sequen
  //std::string strand = (mapping.direction & 1) == 1 ? "+" : "-";
  //std::string strand = (mapping.direction & 1) == 1 ? "+" : "-";
  //uint32_t mapping_end_position = mapping.fragment_start_position + mapping.fragment_length;
  //uint32_t mapping_end_position = mapping.fragment_start_position + mapping.fragment_length;
  const char *reference_sequence_name = (mapping.flag & BAM_FUNMAP) > 0 ? "*" : reference.GetSequenceNameAt(rid);
  const char *reference_sequence_name = (mapping.flag & BAM_FUNMAP) > 0 ? "*" : reference.GetSequenceNameAt(rid);
  this->AppendMappingOutput(mapping.read_name + "\t" + std::to_string(mapping.flag) + "\t" + std::string(reference_sequence_name) + "\t" + std::to_string(mapping.GetStartPosition()) + "\t" + std::to_string(mapping.mapq) + "\t" + mapping.GenerateCigarString() + "\t*\t" + std::to_string(0) + "\t" + std::to_string(0) + "\t*\t*\t" + mapping.GenerateIntTagString("NM", mapping.NM) + "\n");
  this->AppendMappingOutput(mapping.read_name + "\t" + std::to_string(mapping.flag) + "\t" + std::string(reference_sequence_name) + "\t" + std::to_string(mapping.GetStartPosition()) + "\t" + std::to_string(mapping.mapq) + "\t" + mapping.GenerateCigarString() + "\t*\t" + std::to_string(0) + "\t" + std::to_string(0) + "\t*\t*\t" + mapping.GenerateIntTagString("NM", mapping.NM) + "\tMD:Z:" + mapping.MD + "\n");
}
}
} // namespace chromap
} // namespace chromap