Commit 932e13c2 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Fix a bug on second best mappings of paired-end reads and tune mapq

parent 75e7722d
Loading
Loading
Loading
Loading
+36 −18
Original line number Diff line number Diff line
@@ -1039,6 +1039,9 @@ void Chromap<MappingRecord>::GenerateBestMappingsForPairedEndReadOnOneDirection(
          best_mappings->emplace_back(i1, current_i2);
        } else if (current_sum_errors == *second_min_sum_errors) {
          (*num_second_best_mappings)++;
        } else if (current_sum_errors < *second_min_sum_errors) {
          *second_min_sum_errors = current_sum_errors;
          *num_second_best_mappings = 1;
        }
        ++current_i2;
      }
@@ -1460,7 +1463,7 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
          BandedTraceback(min_num_errors, reference.GetSequenceAt(rid) + verification_window_start_position, read, read_length, &mapping_start_position);
          uint32_t fragment_start_position = verification_window_start_position + mapping_start_position;
          uint16_t fragment_length = position - fragment_start_position + 1;
          mapq = GetMAPQForSingleEndRead(0, 0, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
          mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
          uint8_t direction = 1;
          //mapq |= 1;
          if (output_mapping_in_PAF_) {
@@ -1472,7 +1475,7 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
          BandedTraceback(min_num_errors, reference.GetSequenceAt(rid) + verification_window_start_position, negative_read.data(), read_length, &mapping_start_position);
          uint32_t fragment_start_position = verification_window_start_position + mapping_start_position;
          uint16_t fragment_length = position - fragment_start_position + 1;
          mapq = GetMAPQForSingleEndRead(0, 0, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
          mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
          uint8_t direction = 0;
          if (output_mapping_in_PAF_) {
            EmplaceBackMappingRecord(read_id, read_name, (uint16_t)read_length, barcode_key, fragment_start_position, fragment_length, mapq, direction, is_unique, 1, &((*mappings_on_diff_ref_seqs)[rid]));
@@ -2510,9 +2513,9 @@ void Chromap<MappingRecord>::LoadBarcodeWhitelist() {
}

template <typename MappingRecord>
uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int num_candidates, uint32_t repetitive_seed_length, uint16_t alignment_length, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings) {
uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int error_threshold, int num_candidates, uint32_t repetitive_seed_length, uint16_t alignment_length, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings) {
  int mapq_coef = 60;
  //int mapq_coef_length = 60;
  //int mapq_coef_length = 45;
  //double mapq_coef_fraction = log(mapq_coef_length);
  double alignment_identity = 1 - (double)min_num_errors / alignment_length;
  int mapq = 0;
@@ -2528,14 +2531,13 @@ uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int num_candidates, uint
      mapq = 0;
    }
  } else {
    if (second_min_num_errors > error_threshold_) {
      second_min_num_errors = 2 * error_threshold_ + 1;
    if (second_min_num_errors > error_threshold) {
      second_min_num_errors = error_threshold + 1;
    }
    //mapq = (int)(mapq_coef * (1. - (double)min_num_errors / second_min_num_errors) + .499);
    mapq = (int)(mapq_coef * ((double)(second_min_num_errors - min_num_errors) / second_min_num_errors) + .499);
    double tmp = alignment_identity * alignment_identity;
    tmp = tmp * tmp;
    tmp = tmp * tmp;
    //tmp = tmp * tmp;
    mapq = alignment_identity < 0.98 ? (int)(mapq * tmp + .499) : mapq;
    //double tmp = alignment_length < mapq_coef_length ? 1.0 : mapq_coef_fraction / log(alignment_length);
    //tmp *= alignment_identity * alignment_identity;
@@ -2561,21 +2563,37 @@ uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int num_candidates, uint
  return (uint8_t)mapq;
}

#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))

template <typename MappingRecord>
uint8_t Chromap<MappingRecord>::GetMAPQForPairedEndRead(int num_positive_candidates, int num_negative_candidates, uint32_t repetitive_seed_length1, uint32_t repetitive_seed_length2, uint16_t positive_alignment_length, uint16_t negative_alignment_length, int min_sum_errors, int num_best_mappings, int second_min_sum_errors, int num_second_best_mappings, int min_num_errors1, int min_num_errors2, int num_best_mappings1, int num_best_mappings2, int second_min_num_errors1, int second_min_num_errors2, int num_second_best_mappings1, int num_second_best_mappings2) {
  uint8_t mapq_pe = GetMAPQForSingleEndRead(num_positive_candidates + num_negative_candidates, repetitive_seed_length1 + repetitive_seed_length2, positive_alignment_length + negative_alignment_length, min_sum_errors, num_best_mappings, second_min_sum_errors, num_second_best_mappings);
  uint8_t mapq1 = GetMAPQForSingleEndRead(num_positive_candidates, repetitive_seed_length1, positive_alignment_length, min_num_errors1, num_best_mappings1, second_min_num_errors1, num_second_best_mappings1);
  uint8_t mapq2 = GetMAPQForSingleEndRead(num_negative_candidates, repetitive_seed_length2, negative_alignment_length, min_num_errors2, num_best_mappings2, second_min_num_errors2, num_second_best_mappings2);
  uint8_t mapq_pe = GetMAPQForSingleEndRead(2 * error_threshold_, num_positive_candidates + num_negative_candidates, repetitive_seed_length1 + repetitive_seed_length2, positive_alignment_length + negative_alignment_length, min_sum_errors, num_best_mappings, second_min_sum_errors, num_second_best_mappings);
  uint8_t mapq1 = GetMAPQForSingleEndRead(error_threshold_, num_positive_candidates, repetitive_seed_length1, positive_alignment_length, min_num_errors1, num_best_mappings1, second_min_num_errors1, num_second_best_mappings1);
  uint8_t mapq2 = GetMAPQForSingleEndRead(error_threshold_, num_negative_candidates, repetitive_seed_length2, negative_alignment_length, min_num_errors2, num_best_mappings2, second_min_num_errors2, num_second_best_mappings2);
  mapq1 = mapq1 > mapq_pe ? mapq1 : mapq_pe < mapq1 + 40? mapq_pe : mapq1 + 40;
  mapq2 = mapq2 > mapq_pe ? mapq2 : mapq_pe < mapq2 + 40? mapq_pe : mapq2 + 40;
  //uint8_t mapq = (mapq1 + mapq2) / 2;
  uint8_t mapq = (mapq1 + mapq2) / 2 + 0.499;
  if ((mapq1 < mapq2 && mapq1 + 10 > mapq2) || (mapq2 < mapq1 && mapq2 + 10 > mapq1)) {
    //mapq = (mapq1 + mapq2) / 2;
  } else {
    uint8_t min_mapq = mapq1 > mapq2 ? mapq1 : mapq2;
    mapq = (mapq + min_mapq) / 2 + 0.499;
  if (second_min_num_errors1 > error_threshold_) {
    second_min_num_errors1 = 2 * error_threshold_ + 1;
  }
  if (second_min_num_errors2 > error_threshold_) {
    second_min_num_errors2 = 2 * error_threshold_ + 1;
  }
  int mapq_coef = 60;
  uint8_t max_mapq1 = (int)(mapq_coef * ((double)(second_min_num_errors1 - min_num_errors1) / second_min_num_errors1) + .499);
  uint8_t max_mapq2 = (int)(mapq_coef * ((double)(second_min_num_errors2 - min_num_errors2) / second_min_num_errors2) + .499);
  mapq1 = mapq1 < max_mapq1 ? mapq1 : max_mapq1;
  mapq2 = mapq2 < max_mapq2 ? mapq2 : max_mapq2;
  //mapq1 = mapq1 < raw_mapq(second_min_num_errors1 - min_num_errors1, 1) ? mapq1 : raw_mapq(second_min_num_errors1 - min_num_errors1, 1);
  //mapq2 = mapq2 < raw_mapq(second_min_num_errors2 - min_num_errors2, 1) ? mapq1 : raw_mapq(second_min_num_errors2 - min_num_errors2, 1);
  uint8_t mapq = mapq1 < mapq2 ? mapq1 : mapq2;
  //uint8_t mapq = (mapq1 + mapq2) / 2;
  //uint8_t mapq = (mapq1 + mapq2) / 2 + 0.499;
  //if ((mapq1 < mapq2 && mapq1 + 10 > mapq2) || (mapq2 < mapq1 && mapq2 + 10 > mapq1)) {
  //  //mapq = (mapq1 + mapq2) / 2;
  //} else {
  //  uint8_t min_mapq = mapq1 > mapq2 ? mapq1 : mapq2;
  //  mapq = (mapq + 3 * min_mapq) / 4 + 0.499;
  //}
  return (uint8_t)mapq;
}

+1 −1
Original line number Diff line number Diff line
@@ -150,7 +150,7 @@ class Chromap {
  void OutputBarcodeStatistics();
  void OutputMappingStatistics();
  void OutputMappingStatistics(uint32_t num_reference_sequences, const std::vector<std::vector<MappingRecord> > &uni_mappings, const std::vector<std::vector<MappingRecord> > &multi_mappings);
  uint8_t GetMAPQForSingleEndRead(int num_candidates, uint32_t repetitive_seed_length, uint16_t alignment_length, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings);
  uint8_t GetMAPQForSingleEndRead(int error_threshold, int num_candidates, uint32_t repetitive_seed_length, uint16_t alignment_length, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings);
  uint8_t GetMAPQForPairedEndRead(int num_positive_candidates, int num_negative_candidates, uint32_t repetitive_seed_length1, uint32_t repetitive_seed_length2, uint16_t positive_alignment_length, uint16_t negative_alignment_length, int min_sum_errors, int num_best_mappings, int second_min_sum_errors, int num_second_best_mappings, int min_num_errors1, int min_num_errors2, int num_best_mappings1, int num_best_mappings2, int second_min_num_errors1, int second_min_num_errors2, int num_second_best_mappings1, int num_second_best_mappings2);
  void SortOutputMappings(uint32_t num_reference_sequences, std::vector<std::vector<MappingRecord> > *mappings);
  void BuildAugmentedTree(uint32_t ref_id);