Commit 8419f917 authored by Li's avatar Li
Browse files

Fix several issues with computing repetitive_seed_length

parent 624b80dc
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -719,7 +719,6 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
		      if (augment_flag) {
			      positive_hits->clear();
			      negative_hits->clear();
			      *repetitive_seed_length = 0;
			      if (mate_positive_candidates->size() > 0) 
				      index.GenerateCandidatesFromRepetitiveReadWithMateInfo(error_threshold_, *minimizers, repetitive_seed_length, negative_hits, augment_negative_candidates, mate_positive_candidates, -1, 2 * max_insert_size_) ;
			      if (mate_negative_candidates->size() > 0)
@@ -1130,7 +1129,7 @@ void Chromap<MappingRecord>::GenerateBestMappingsForPairedEndReadOnOneDirection(
      uint32_t current_i2 = i2;
      while (current_i2 < mappings2.size() && ((first_read_direction == kPositive && mappings2[current_i2].second <= mappings1[i1].second + max_insert_size_ - read2_length) || (first_read_direction == kNegative && mappings2[current_i2].second <= mappings1[i1].second + read1_length - min_overlap_length))) {
#ifdef LI_DEBUG 
	printf("%s: %llu %d %llu %d\n", __func__, mappings1[i1].second >> 32, int(mappings1[i1].second), mappings2[i2].second>>32, int(mappings2[i2].second)) ;
	printf("%s passed: %llu %d %llu %d: %d %d %d\n", __func__, mappings1[i1].second >> 32, int(mappings1[i1].second), mappings2[current_i2].second>>32, int(mappings2[current_i2].second), mappings1[i1].first + mappings2[current_i2].first, mappings1[i1].first, mappings2[current_i2].first) ;
#endif
        int current_sum_errors = mappings1[i1].first + mappings2[current_i2].first;
        if (current_sum_errors < *min_sum_errors) {
@@ -1938,10 +1937,12 @@ void Chromap<MappingRecord>::MergeCandidates(std::vector<Candidate> &c1, const s
	size2 = c2.size();
	buffer.clear();

	/*for (i = 0 ; i < size1 ; ++i)
#ifdef LI_DEBUG
	for (i = 0 ; i < size1 ; ++i)
		printf("c1: %d %d %d\n", (int)(c1[i].position >> 32), (int)c1[i].position, c1[i].count);
	for (i = 0 ; i < size2 ; ++i)
		printf("c2: %d %d %d\n", (int)(c2[i].position >> 32), (int)c2[i].position, c2[i].count);*/
		printf("c2: %d %d %d\n", (int)(c2[i].position >> 32), (int)c2[i].position, c2[i].count);
#endif
	i = 0; j = 0;
	while (i < size1 && j < size2) {
		if (c1[i].position == c2[j].position) {
@@ -2707,6 +2708,7 @@ uint8_t Chromap<MappingRecord>::GetMAPQForSingleEndRead(int error_threshold, int
    mapq = 5 * 6.02 * (second_min_num_errors - min_num_errors) * tmp * tmp + 0.499;
    //mapq = 30 - 34.0 / error_threshold + 34.0 / error_threshold * (second_min_num_errors - min_num_errors) * tmp * tmp + 0.499;
  }
  //printf("%d %d %d\n", mapq);
  if (num_second_best_mappings > 0) {
    mapq -= (int)(4.343 * log(num_second_best_mappings + 1) + 0.499);
  }
+9 −6
Original line number Diff line number Diff line
@@ -374,7 +374,7 @@ void Index::GenerateCandidatesOnOneDirection(int error_threshold, int num_seeds_
}

// Return the number of repetitive seeds
int Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, bool use_heap) {
int Index::CollectCandidates(int max_seed_frequency, int repetitive_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, bool use_heap) {
  uint32_t num_minimizers = minimizers.size();
  int repetitive_seed_count = 0 ;
  std::vector<uint64_t> *mm_positive_hits = NULL, *mm_negative_hits = NULL;
@@ -440,7 +440,9 @@ int Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pair
	      negative_hits->push_back(candidate);
          }
        } 
      } else {
      } 
      
      if (num_occurrences >= (uint32_t)repetitive_seed_frequency){
        if (previous_repetitive_seed_position > read_position) { // first minimizer
          *repetitive_seed_length += kmer_size_ + window_size_ - 1;
        } else {
@@ -546,6 +548,7 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
  if (best_candidate_num != 1 || max_count < min_num_seeds_required_for_mapping_) 
  	return;

  *repetitive_seed_length = 0 ;
  for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
    khiter_t khash_iterator = kh_get(k64, lookup_table_, minimizers[mi].first << 1);
    if (khash_iterator == kh_end(lookup_table_)) {
@@ -610,7 +613,7 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
        }
      }  

      if (num_occurrences >= (uint32_t)max_seed_frequencies_[1]) {
      if (num_occurrences >= (uint32_t)max_seed_frequencies_[0]) {
        if (previous_repetitive_seed_position > read_position) { // first minimizer
          *repetitive_seed_length += kmer_size_ + window_size_ - 1;
        } else {
@@ -635,12 +638,12 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates) {
  *repetitive_seed_length = 0;
  bool recollect = true;
  int repetitive_seed_count = CollectCandidates(max_seed_frequencies_[0], minimizers, repetitive_seed_length, positive_hits, negative_hits, false);
  int repetitive_seed_count = CollectCandidates(max_seed_frequencies_[0], max_seed_frequencies_[0], minimizers, repetitive_seed_length, positive_hits, negative_hits, false);
  if (repetitive_seed_count > (int)minimizers.size() / 2 && minimizers.size() >= 10) {
    positive_hits->clear();
    negative_hits->clear();
    *repetitive_seed_length = 0;
    repetitive_seed_count = CollectCandidates(max_seed_frequencies_[1], minimizers, repetitive_seed_length, positive_hits, negative_hits, true);
    repetitive_seed_count = CollectCandidates(max_seed_frequencies_[1], max_seed_frequencies_[0], minimizers, repetitive_seed_length, positive_hits, negative_hits, true);
    recollect = false;
  }

@@ -655,7 +658,7 @@ void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<
    negative_hits->clear();
    //printf("second round\n") ;
    *repetitive_seed_length = 0;
    CollectCandidates(max_seed_frequencies_[1], minimizers, repetitive_seed_length, positive_hits, negative_hits, true);
    CollectCandidates(max_seed_frequencies_[1], max_seed_frequencies_[0], minimizers, repetitive_seed_length, positive_hits, negative_hits, true);
    //printf("p+n2: %d\n", positive_hits->size() + negative_hits->size()) ;
    GenerateCandidatesOnOneDirection(error_threshold, min_num_seeds_required_for_mapping_, positive_hits, positive_candidates);
    GenerateCandidatesOnOneDirection(error_threshold, min_num_seeds_required_for_mapping_, negative_hits, negative_candidates);
+1 −1
Original line number Diff line number Diff line
@@ -68,7 +68,7 @@ class Index {
  void GenerateCandidatesOnOneDirection(int error_threshold, int num_seeds_required, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates);
  void GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates);
  void GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates, std::vector<Candidate> *mate_candidates, int direction, unsigned int range);
  int CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, bool use_heap);
  int CollectCandidates(int max_seed_frequency, int repetitive_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, bool use_heap);
  inline static uint64_t Hash64(uint64_t key, const uint64_t mask) {
    key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
    key = key ^ key >> 24;