Commit f0db86e2 authored by Li's avatar Li
Browse files

Avoid repeated candidate add in supplementary

parent 93a80914
Loading
Loading
Loading
Loading
+65 −45
Original line number Diff line number Diff line
@@ -494,11 +494,31 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
  }
  //if (best_candidate_num != 1 || max_count < min_num_seeds_required_for_mapping_) 
  //  return;

  std::vector<std::pair<uint64_t, uint64_t> > boundaries ;
  for (uint32_t ci = 0; ci < mate_candidates_size; ++ci){
    if (mate_candidates->at(ci).count != max_count) {
      continue;
    if (mate_candidates->at(ci).count == max_count) {
      std::pair<uint64_t, uint64_t> r;
      r.first = (mate_candidates->at(ci).position < range) ? 0 : (mate_candidates->at(ci).position - range);
      r.second = mate_candidates->at(ci).position + range;
      boundaries.push_back(r);
    }
  }
  
  // Merge adjacent boundary point. assume the candidates are sorted by coordinate so boundaries are also sorted.
  uint32_t boundary_size = 1;
  uint32_t raw_boundary_size = boundaries.size();
  if (raw_boundary_size == 0)
    return;
  for (uint32_t bi = 1; bi < raw_boundary_size; ++bi){
    if (boundaries[boundary_size - 1].second < boundaries[bi].first){
      boundaries[boundary_size] = boundaries[bi];
      ++boundary_size;
    } else {
      boundaries[boundary_size - 1].second = boundaries[bi].second;
    }
  }
  boundaries.resize(boundary_size);

  *repetitive_seed_length = 0 ;
  for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
    khiter_t khash_iterator = kh_get(k64, lookup_table_, minimizers[mi].first << 1);
@@ -529,14 +549,15 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
    } else {
      uint32_t offset = value >> 32;
      uint32_t num_occurrences = value;
      for (uint32_t bi = 0; bi < boundary_size; ++bi) {
        // use binary search to locate the coordinate near mate position
        int32_t l = 0, m = 0, r = num_occurrences - 1;
        uint64_t boundary = (mate_candidates->at(ci).position < range) ? 0 : (mate_candidates->at(ci).position - range) ;
        uint64_t boundary = boundaries[bi].first ;
        while (l <= r) {
          m = (l + r) / 2;
          uint64_t value = (occurrence_table_[offset + m])>>1;
          //std::cerr << "l: " << l << ", r: " << r << ", m: " << m << ", val: " << (value >> 32) << ", " << (uint32_t)value << ", bd: " << (boundary >> 32) << ", " << (uint32_t)(boundary) << "\n";
          //if (value <= boundary) {
          //if (value <= boundary) 
          if (value < boundary) {
            l = m + 1;
          } else if (value > boundary) {
@@ -548,7 +569,7 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
        //printf("%s: %d %d: %d %d\n", __func__, m, num_occurrences, (int)(boundary>>32), (int)boundary) ;
        for (uint32_t oi = m; oi < num_occurrences; ++oi) {
          uint64_t value = occurrence_table_[offset + oi];
          if ((value >> 1) > mate_candidates->at(ci).position + range)
          if ((value >> 1) > boundaries[bi].second)
            break;
          uint64_t reference_id = value >> 33;
          uint32_t reference_position = value >> 1;
@@ -564,7 +585,7 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
            hits->push_back(candidate);
          }
        }  

      } // for bi
      if (num_occurrences >= (uint32_t)max_seed_frequencies_[0]) {
        if (previous_repetitive_seed_position > read_position) { // first minimizer
          *repetitive_seed_length += kmer_size_;
@@ -577,9 +598,8 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
        }
        previous_repetitive_seed_position = read_position;
      }
    } // for if-else on occurence
  } // for mi
    }
  }

  std::sort(hits->begin(), hits->end());
  //for (uint32_t i = 0 ; i < hits->size(); ++i)