Commit b219427b authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Tune mapq

parent 61b8fd1b
Loading
Loading
Loading
Loading
+33 −24
Original line number Diff line number Diff line
@@ -610,7 +610,7 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
          for (uint32_t pair_index = 0; pair_index < num_loaded_pairs; ++pair_index) {
            read_batch1.PrepareNegativeSequenceAt(pair_index);
            read_batch2.PrepareNegativeSequenceAt(pair_index);
            //std::cerr << read_batch1.GetSequenceNameAt(pair_index);
            //std::cerr << read_batch1.GetSequenceNameAt(pair_index) << "\n";
            if (trim_adapters_) {
              TrimAdapterForPairedEndRead(pair_index, &read_batch1, &read_batch2);
            }
@@ -663,7 +663,9 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
                positive_candidates2.clear();
                negative_candidates1.clear();
                negative_candidates2.clear();
                //std::cerr << "#pc1: " << positive_candidates1_buffer.size() << ", #nc1: " << negative_candidates1_buffer.size() << ", #pc2: " << positive_candidates2_buffer.size() << ", #nc2: " << negative_candidates2_buffer.size() << "\n";
                ReduceCandidatesForPairedEndRead(positive_candidates1_buffer, negative_candidates1_buffer, positive_candidates2_buffer, negative_candidates2_buffer, &positive_candidates1, &negative_candidates1, &positive_candidates2, &negative_candidates2);
                //std::cerr << "After pe filter, #pc1: " << positive_candidates1.size() << ", #nc1: " << negative_candidates1.size() << ", #pc2: " << positive_candidates2.size() << ", #nc2: " << negative_candidates2.size() << "\n";
                thread_num_candidates += positive_candidates1.size() + positive_candidates2.size() + negative_candidates1.size() + negative_candidates2.size();
                positive_mappings1.clear();
                positive_mappings2.clear();
@@ -2073,6 +2075,7 @@ void Chromap<MappingRecord>::VerifyCandidates(const SequenceBatch &read_batch, u
  } else {
    std::vector<Candidate> sorted_candidates(positive_candidates);
    std::sort(sorted_candidates.begin(), sorted_candidates.end());
    //std::cerr << "best: " << sorted_candidates[0].count << " " << "second best: " << sorted_candidates[1].count << "\n";
    VerifyCandidatesOnOneDirectionUsingSIMD(kPositive, read_batch, read_index, reference, sorted_candidates, positive_mappings, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
    //VerifyCandidatesOnOneDirectionUsingSIMD(kPositive, read_batch, read_index, reference, positive_candidates, positive_mappings, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
  }
@@ -2081,6 +2084,7 @@ void Chromap<MappingRecord>::VerifyCandidates(const SequenceBatch &read_batch, u
  } else {
    std::vector<Candidate> sorted_candidates(negative_candidates);
    std::sort(sorted_candidates.begin(), sorted_candidates.end());
    //std::cerr << "best: " << sorted_candidates[0].count << " " << "second best: " << sorted_candidates[1].count << "\n";
    VerifyCandidatesOnOneDirectionUsingSIMD(kNegative, read_batch, read_index, reference, sorted_candidates, negative_mappings, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
    //VerifyCandidatesOnOneDirectionUsingSIMD(kNegative, read_batch, read_index, reference, negative_candidates, negative_mappings, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
  }
@@ -2522,15 +2526,15 @@ uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int error_threshold, int
  int mapq = 0;
  if (num_best_mappings > 1) {
    //mapq = -4.343 * log(1 - 1.0 / num_best_mappings);
    if (num_best_mappings == 2) {
      mapq = 3;
    } else if (num_best_mappings == 3) {
      mapq = 2;
    } else if (num_best_mappings < 10) {
      mapq = 1;
    } else {
      mapq = 0;
    }
    //if (num_best_mappings == 2) {
    //  mapq = 3;
    //} else if (num_best_mappings == 3) {
    //  mapq = 2;
    //} else if (num_best_mappings < 10) {
    //  mapq = 1;
    //} else {
    //  mapq = 0;
    //}
  } else {
    //if (second_min_num_errors > error_threshold) {
    //  second_min_num_errors = error_threshold + 1;
@@ -2542,7 +2546,7 @@ uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int error_threshold, int
    //mapq = alignment_identity < 0.98 ? (int)(mapq * tmp + .499) : mapq;
    double tmp = alignment_length < mapq_coef_length ? 1.0 : mapq_coef_fraction / log(alignment_length);
    tmp *= alignment_identity * alignment_identity;
    mapq = 4 * 6.02 * (second_min_num_errors - min_num_errors) * tmp * tmp + 0.499;
    mapq = 5 * 6.02 * (second_min_num_errors - min_num_errors) * tmp * tmp + 0.499;
  }
  if (num_second_best_mappings > 0) {
    mapq -= (int)(4.343 * log(num_second_best_mappings + 1) + 0.499);
@@ -2553,18 +2557,21 @@ uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int error_threshold, int
  if (mapq > 60) {
    mapq = 60;
  }
  if (mapq < 0) {
    mapq = 0;
  }
  if (repetitive_seed_length > 0) {
    double frac_rep = (repetitive_seed_length) / (double)alignment_length;
    mapq =  mapq * (1 - frac_rep) + 0.499;
    if (repetitive_seed_length >= alignment_length) {
      frac_rep = 0.999;
    }
  if (mapq < 0) {
    mapq = 0;
    mapq =  mapq * (1 - frac_rep) + 0.499;
  }
  //mapq <<= 1;
  return (uint8_t)mapq;
}

#define raw_mapq(diff, a) ((int)(4 * 6.02 * (diff) / (a) + .499))
#define raw_mapq(diff, a) ((int)(5 * 6.02 * (diff) / (a) + .499))

template <typename MappingRecord>
uint8_t Chromap<MappingRecord>::GetMAPQForPairedEndRead(int num_positive_candidates, int num_negative_candidates, uint32_t repetitive_seed_length1, uint32_t repetitive_seed_length2, uint16_t positive_alignment_length, uint16_t negative_alignment_length, int min_sum_errors, int num_best_mappings, int second_min_sum_errors, int num_second_best_mappings, int min_num_errors1, int min_num_errors2, int num_best_mappings1, int num_best_mappings2, int second_min_num_errors1, int second_min_num_errors2, int num_second_best_mappings1, int num_second_best_mappings2) {
@@ -2589,16 +2596,18 @@ uint8_t Chromap<MappingRecord>::GetMAPQForPairedEndRead(int num_positive_candida
  //mapq2 = mapq2 < max_mapq2 ? mapq2 : max_mapq2;
  mapq1 = mapq1 < raw_mapq(second_min_num_errors1 - min_num_errors1, 1) ? mapq1 : raw_mapq(second_min_num_errors1 - min_num_errors1, 1);
  mapq2 = mapq2 < raw_mapq(second_min_num_errors2 - min_num_errors2, 1) ? mapq2 : raw_mapq(second_min_num_errors2 - min_num_errors2, 1);
  //uint8_t mapq = mapq1 < mapq2 ? mapq1 : mapq2;
  //std::cerr << " 1:" << (int)mapq1 << " 2:" << (int)mapq2 << " mapq_pe:" << (int)mapq_pe << " mapq:" << (int)mapq << "\n";
  uint8_t mapq = mapq1 < mapq2 ? mapq1 : mapq2;
  //uint8_t mapq = (mapq1 + mapq2) / 2;
  uint8_t mapq = (mapq1 + mapq2) / 2 + 0.499;
  if ((mapq1 < mapq2 && mapq1 + 10 > mapq2) || (mapq2 < mapq1 && mapq2 + 10 > mapq1)) {
    //mapq = (mapq1 + mapq2) / 2;
  } else {
    uint8_t min_mapq = mapq1 > mapq2 ? mapq1 : mapq2;
    mapq = (mapq + 1 * min_mapq) / 2 + 0.499;
  }
  //uint8_t mapq = (mapq1 + mapq2) / 2 + 0.499;
  //if ((mapq1 < mapq2 && mapq1 + 30 >= mapq2) || (mapq2 < mapq1 && mapq2 + 30 >= mapq1)) {
  //  uint8_t min_mapq = mapq1 < mapq2 ? mapq1 : mapq2;
  //  mapq = min_mapq;
  //  //mapq = (mapq + 1 * min_mapq) / 2 + 0.499;
  //} else {
  //  uint8_t max_mapq = mapq1 > mapq2 ? mapq1 : mapq2;
  //  mapq = (mapq + 1 * max_mapq) / 2 + 0.499;
  //}
  //std::cerr << " 1:" << (int)mapq1 << " 2:" << (int)mapq2 << " mapq_pe:" << (int)mapq_pe << " mapq:" << (int)mapq << "\n";
  return (uint8_t)mapq;
}

+44 −3
Original line number Diff line number Diff line
@@ -46,6 +46,7 @@ void Index::GenerateMinimizerSketch(const SequenceBatch &sequence_batch, uint32_
  int unambiguous_length = 0;
  int position_in_buffer = 0;
  int min_position = 0;
  //std::pair<uint64_t, uint64_t> pre_seed = {UINT64_MAX, UINT64_MAX};
  for (uint32_t position = 0; position < sequence_length; ++position) {
    uint8_t current_base = SequenceBatch::CharToUint8(sequence[position]);
    std::pair<uint64_t, uint64_t> current_seed = {UINT64_MAX, UINT64_MAX};
@@ -69,6 +70,43 @@ void Index::GenerateMinimizerSketch(const SequenceBatch &sequence_batch, uint32_
    }
    buffer[position_in_buffer] = current_seed; // need to do this here as appropriate position_in_buffer and buf[position_in_buffer] are needed below
    if (unambiguous_length == window_size_ + kmer_size_ - 1 && min_seed.first != UINT64_MAX) { // special case for the first window - because identical k-mers are not stored yet
      //
      //bool found_first_min_in_first_loop = false;
      //bool found_first_min_in_second_loop = false; // For debug
      //int first_min_position_in_buffer = 0;
      //for (int j = position_in_buffer + 1; j < window_size_; ++j)
      //  if (min_seed.first == buffer[j].first) {
      //    found_first_min_in_first_loop = true;
      //    first_min_position_in_buffer = j;
      //    break;
      //  }
      //if (!found_first_min_in_first_loop) {
      //  for (int j = 0; j < position_in_buffer; ++j)
      //    if (min_seed.first == buffer[j].first) {
      //      found_first_min_in_second_loop = true;
      //      first_min_position_in_buffer = j;
      //      break;
      //    }
      //}
      //assert(found_first_min_in_first_loop || found_first_min_in_second_loop);
      //bool found_pre_seed = false;
      //if (found_first_min_in_first_loop) {
      //  for (int j = position_in_buffer + 1; j < first_min_position_in_buffer; ++j)
      //    if (buffer[j].first < pre_seed.first) {
      //      pre_seed = buffer[j];
      //      found_pre_seed = true;
      //    }
      //}
      //if (found_first_min_in_second_loop) {
      //  for (int j = 0; j < first_min_position_in_buffer; ++j)
      //    if (buffer[j].first < pre_seed.first) {
      //      pre_seed = buffer[j];
      //      found_pre_seed = true;
      //    }
      //}
      //if (found_pre_seed)
      //  minimizers->push_back(pre_seed);
      //
      for (int j = position_in_buffer + 1; j < window_size_; ++j)
        if (min_seed.first == buffer[j].first && buffer[j].second != min_seed.second) 
          minimizers->push_back(buffer[j]);
@@ -263,6 +301,7 @@ void Index::Load() {
}

void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates) {
  //std::cerr << "Direction\n";
  hits->emplace_back(UINT64_MAX);
  if (hits->size() > 0) {
    std::sort(hits->begin(), hits->end());
@@ -279,6 +318,7 @@ void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<ui
          nc.position = previous_hit;
          nc.count = count;
          candidates->push_back(nc);
          //std::cerr << count << " ";
        }
        count = 1;
      } else {
@@ -340,12 +380,12 @@ void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pai
        } 
      } else {
        if (previous_repetitive_seed_position > read_position) { // first minimizer
          *repetitive_seed_length += kmer_size_;
          *repetitive_seed_length += kmer_size_ + window_size_ - 1;
        } else {
          if (read_position < previous_repetitive_seed_position + kmer_size_) {
            *repetitive_seed_length += previous_repetitive_seed_position + kmer_size_ - read_position;
            *repetitive_seed_length += read_position - previous_repetitive_seed_position;
          } else {
            *repetitive_seed_length += kmer_size_;
            *repetitive_seed_length += kmer_size_ + window_size_ - 1;
          }
        }
        previous_repetitive_seed_position = read_position;
@@ -355,6 +395,7 @@ void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pai
}

void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates) {
  *repetitive_seed_length = 0;
  CollectCandidates(max_seed_frequencies_[0], minimizers, repetitive_seed_length, positive_hits, negative_hits);
  // Now I can generate primer chain in candidates
  // Let me use sort for now, but I can use merge later.