Commit 8b31bade authored by Li's avatar Li
Browse files

Better rule for stop early heuristic. Try to redo the collect seeds if too many repetitive seeds

parent 914ab5de
Loading
Loading
Loading
Loading
+23 −4
Original line number Diff line number Diff line
@@ -944,6 +944,12 @@ void Chromap<MappingRecord>::ReduceCandidatesForPairedEndReadOnOneDirection(cons
  uint32_t i2 = 0;
  uint32_t mapping_positions_distance = max_insert_size_;
  uint32_t previous_end_i2 = i2;
#ifdef LI_DEBUG
  for (int i = 0 ; i < candidates1.size(); ++i)
    printf("%s 0: %d %d:%d\n", __func__, i, (int)(candidates1[i].position>>32), (int)candidates1[i].position) ;
  for (int i = 0 ; i < candidates2.size(); ++i)
    printf("%s 1: %d %d:%d\n", __func__, i, (int)(candidates2[i].position>>32), (int)candidates2[i].position) ;
#endif
  while (i1 < candidates1.size() && i2 < candidates2.size()) {
    if (candidates1[i1].position > candidates2[i2].position + mapping_positions_distance) {
      ++i2;
@@ -1043,6 +1049,12 @@ void Chromap<MappingRecord>::GenerateBestMappingsForPairedEndReadOnOneDirection(
  uint32_t min_overlap_length = min_read_length_;
  uint32_t read1_length = read_batch1.GetSequenceLengthAt(pair_index);
  uint32_t read2_length = read_batch2.GetSequenceLengthAt(pair_index);
#ifdef LI_DEBUG
  for (int i = 0; i < mappings1.size(); ++i) 
    printf("mappings1 %d %d:%d\n", i, (int)(mappings1[i].second>>32), (int)mappings1[i].second) ;
  for (int i = 0; i < mappings1.size(); ++i) 
    printf("mappings2 %d %d:%d\n", i, (int)(mappings2[i].second>>32), (int)mappings2[i].second) ;
#endif
  while (i1 < mappings1.size() && i2 < mappings2.size()) {
    if ((first_read_direction == kNegative && mappings1[i1].second > mappings2[i2].second + max_insert_size_ - read1_length) || (first_read_direction == kPositive && mappings1[i1].second > mappings2[i2].second + read2_length - min_overlap_length)) {
      ++i2;
@@ -1053,6 +1065,9 @@ void Chromap<MappingRecord>::GenerateBestMappingsForPairedEndReadOnOneDirection(
      // then we go back and then move to next pi1 and keep doing the similar thing. 
      uint32_t current_i2 = i2;
      while (current_i2 < mappings2.size() && ((first_read_direction == kPositive && mappings2[current_i2].second <= mappings1[i1].second + max_insert_size_ - read2_length) || (first_read_direction == kNegative && mappings2[current_i2].second <= mappings1[i1].second + read1_length - min_overlap_length))) {
#ifdef LI_DEBUG 
	printf("%s: %llu %d %llu %d\n", __func__, mappings1[i1].second >> 32, int(mappings1[i1].second), mappings2[i2].second>>32, int(mappings2[i2].second)) ;
#endif
        int current_sum_errors = mappings1[i1].first + mappings2[current_i2].first;
        if (current_sum_errors < *min_sum_errors) {
          *second_min_sum_errors = *min_sum_errors;
@@ -1949,7 +1964,7 @@ void Chromap<MappingRecord>::VerifyCandidatesOnOneDirectionUsingSIMD(Direction c
      valid_candidate_index = 0;
      // Check whether we should stop early. Assuming the candidates are sorted 
      if (GetMAPQForSingleEndRead(error_threshold_, num_candidates, 0, read_length + error_threshold_, *min_num_errors, *num_best_mappings, *second_min_num_errors, *num_second_best_mappings) == 0)
        break;
        candidate_count_threshold = candidates[candidate_index].count - 1;
    }
    ++candidate_index;
  }
@@ -2051,14 +2066,18 @@ void Chromap<MappingRecord>::VerifyCandidates(const SequenceBatch &read_batch, u
  int maxTag = 0;
  //printf("LI_DEBUG: %u %u\n", positive_candidates.size() + negative_candidates.size(), minimizers.size()) ;
  for (i = 0; i < positive_candidates.size(); ++i) {
    //printf("+ %u %u\n", i, positive_candidates[i].mmCnt) ;
#ifdef LI_DEBUG
    printf("%s + %u %u %d:%d\n", __func__, i, positive_candidates[i].count, (int)(positive_candidates[i].position>>32), (int)positive_candidates[i].position) ;
#endif
    if (positive_candidates[i].count == minimizers.size()) {
      maxTag = i << 1;
      ++maxCnt;
    }
  }
  for (i = 0; i < negative_candidates.size(); ++i) {
    //printf("- %u %u\n", i, negative_candidates[i].mmCnt) ;
#ifdef LI_DEBUG
    printf("%s - %u %u %d:%d\n", __func__, i, negative_candidates[i].count, (int)(negative_candidates[i].position>>32), (int)negative_candidates[i].position) ;
#endif
    if (negative_candidates[i].count == minimizers.size()) {
      maxTag = (i << 1)|1;
      ++maxCnt;
+110 −9
Original line number Diff line number Diff line
@@ -8,6 +8,14 @@

namespace chromap {

struct mmHit {
	uint32_t mi ;
	uint64_t position ;
	bool operator<(const mmHit &h)  const {
		return position > h.position; // the inversed direction is to make a min-heap
	}
} ;

void Index::Statistics(uint32_t num_sequences, const SequenceBatch &reference) {
  double real_start_time = Chromap<>::GetRealTime();
  int n = 0, n1 = 0;
@@ -314,7 +322,7 @@ void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<ui
  //std::cerr << "Direction\n";
  hits->emplace_back(UINT64_MAX);
  if (hits->size() > 0) {
    std::sort(hits->begin(), hits->end());
    //std::sort(hits->begin(), hits->end());
    int count = 1;
    uint64_t previous_hit = (*hits)[0];
    uint32_t previous_reference_id = previous_hit >> 32;
@@ -322,6 +330,9 @@ void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<ui
    for (uint32_t pi = 1; pi < hits->size(); ++pi) {
      uint32_t current_reference_id = (*hits)[pi] >> 32;
      uint32_t current_reference_position = (*hits)[pi];
#ifdef LI_DEBUG
      printf("%s: %d %d\n", __func__, current_reference_id, current_reference_position);
#endif
      if (current_reference_id != previous_reference_id || current_reference_position > previous_reference_position + error_threshold) {
        if (count >= min_num_seeds_required_for_mapping_) {
          Candidate nc;
@@ -341,8 +352,15 @@ void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<ui
  }
}

void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits) {
// Return the number of repetitive seeds
int Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, bool use_heap) {
  uint32_t num_minimizers = minimizers.size();
  int repetitive_seed_count = 0 ;
  std::vector<uint64_t> *mm_positive_hits = NULL, *mm_negative_hits = NULL;
  if (use_heap) {
    mm_positive_hits = new std::vector<uint64_t>[num_minimizers] ;
    mm_negative_hits = new std::vector<uint64_t>[num_minimizers] ;
  }
  positive_hits->reserve(max_seed_frequencies_[0]);
  negative_hits->reserve(max_seed_frequencies_[0]);
  uint32_t previous_repetitive_seed_position = std::numeric_limits<uint32_t>::max();
@@ -364,15 +382,22 @@ void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pai
        // ok, for now we can't see the reference here. So let us don't do the check.
        // Instead, we do it later some time when we check the candidates.
        uint64_t candidate = (reference_id << 32) | candidate_position;
        if (use_heap)
	  mm_positive_hits[mi].push_back(candidate);
	else
	  positive_hits->push_back(candidate);
      } else {
        uint32_t candidate_position = reference_position + read_position - kmer_size_ + 1;// < reference_length ? reference_position - read_position : 0;
        uint64_t candidate = (reference_id << 32) | candidate_position;
	if (use_heap)
          mm_negative_hits[mi].push_back(candidate);
	else
	  negative_hits->push_back(candidate);
      }
    } else {
      uint32_t offset = value >> 32;
      uint32_t num_occurrences = value;
      //printf("%s: %u %u\n", __func__, offset, num_occurrences) ;
      if (num_occurrences < (uint32_t)max_seed_frequency) {
        for (uint32_t oi = 0; oi < num_occurrences; ++oi) {
          uint64_t value = occurrence_table_[offset + oi];
@@ -381,10 +406,16 @@ void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pai
          if (((minimizers[mi].second & 1) ^ (value & 1)) == 0) { // same
            uint32_t candidate_position = reference_position - read_position;
            uint64_t candidate = (reference_id << 32) | candidate_position;
	    if (use_heap)
              mm_positive_hits[mi].push_back(candidate);
	    else
	      positive_hits->push_back(candidate);
          } else {
            uint32_t candidate_position = reference_position + read_position - kmer_size_ + 1;
            uint64_t candidate = (reference_id << 32) | candidate_position;
	    if (use_heap)
              mm_negative_hits[mi].push_back(candidate);
	    else
	      negative_hits->push_back(candidate);
          }
        } 
@@ -399,9 +430,70 @@ void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pai
          }
        }
        previous_repetitive_seed_position = read_position;
        ++repetitive_seed_count ;
      }
    }
  }

  if (use_heap) {
	  std::priority_queue<struct mmHit> heap;
	  unsigned int *mm_pos = new unsigned int[num_minimizers];
	  positive_hits->clear();
	  for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
		  if (mm_positive_hits[mi].size() == 0)
			  continue;
		  struct mmHit nh;
		  nh.mi = mi;
		  nh.position = mm_positive_hits[mi][0];
		  heap.push(nh);
		  mm_pos[mi] = 0;
	  }

	  while(!heap.empty()) {
		  struct mmHit top = heap.top();
		  heap.pop();
		  positive_hits->push_back(top.position) ;
		  ++mm_pos[top.mi];
		  if (mm_pos[top.mi] < mm_positive_hits[top.mi].size())
		  {
			  struct mmHit nh;
			  nh.mi = top.mi;
			  nh.position = mm_positive_hits[top.mi][mm_pos[top.mi]];
			  heap.push(nh);
		  }
	  }

	  negative_hits->clear();
	  for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
		  if (mm_negative_hits[mi].size() == 0)
			  continue;
		  struct mmHit nh;
		  nh.mi = mi;
		  nh.position = mm_negative_hits[mi][0];
		  heap.push(nh);
		  mm_pos[mi] = 0;
	  }
	  while(!heap.empty()) {
		  struct mmHit top = heap.top();
		  heap.pop();
		  negative_hits->push_back(top.position) ;
		  ++mm_pos[top.mi];
		  if (mm_pos[top.mi] < mm_negative_hits[top.mi].size())
		  {
			  struct mmHit nh;
			  nh.mi = top.mi;
			  nh.position = mm_negative_hits[top.mi][mm_pos[top.mi]];
			  heap.push(nh);
		  }
	  }
	  delete[] mm_positive_hits;
	  delete[] mm_negative_hits;
	  delete[] mm_pos ;
  } else {
  	std::sort(positive_hits->begin(), positive_hits->end());
	std::sort(negative_hits->begin(), negative_hits->end());
  }
  return repetitive_seed_count ;
}

void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates, std::vector<Candidate> *mate_candidates, int direction, unsigned int range) // directoin: +1: positive; -1: negative
@@ -488,24 +580,33 @@ void Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold
    }
  } // for mi

  std::sort(hits->begin(), hits->end());
  GenerateCandidatesOnOneDirection(error_threshold, hits, candidates);
  //printf("%s: %d %d\n", __func__, hits->size(), candidates->size()) ;
}

void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates) {
  *repetitive_seed_length = 0;
  CollectCandidates(max_seed_frequencies_[0], minimizers, repetitive_seed_length, positive_hits, negative_hits);
  bool recollect = true;
  int repetitive_seed_count = CollectCandidates(max_seed_frequencies_[0], minimizers, repetitive_seed_length, positive_hits, negative_hits, false);
  if (repetitive_seed_count > (int)minimizers.size() / 2) {
    positive_hits->clear();
    negative_hits->clear();
    *repetitive_seed_length = 0;
    CollectCandidates(max_seed_frequencies_[1], minimizers, repetitive_seed_length, positive_hits, negative_hits, true);
    recollect = false;
  }
  // Now I can generate primer chain in candidates
  // Let me use sort for now, but I can use merge later.
  //printf("p+n: %d\n", positive_hits->size() + negative_hits->size()) ;
  GenerateCandidatesOnOneDirection(error_threshold, positive_hits, positive_candidates);
  GenerateCandidatesOnOneDirection(error_threshold, negative_hits, negative_candidates);
  if (positive_candidates->size() + negative_candidates->size() == 0) {
  if (positive_candidates->size() + negative_candidates->size() == 0 && recollect) {
    positive_hits->clear();
    negative_hits->clear();
    //printf("second round\n") ;
    *repetitive_seed_length = 0;
    CollectCandidates(max_seed_frequencies_[1], minimizers, repetitive_seed_length, positive_hits, negative_hits);
    CollectCandidates(max_seed_frequencies_[1], minimizers, repetitive_seed_length, positive_hits, negative_hits, true);
    //printf("p+n2: %d\n", positive_hits->size() + negative_hits->size()) ;
    GenerateCandidatesOnOneDirection(error_threshold, positive_hits, positive_candidates);
    GenerateCandidatesOnOneDirection(error_threshold, negative_hits, negative_candidates);
+4 −1
Original line number Diff line number Diff line
@@ -3,10 +3,13 @@

#include <string>
#include <vector>
#include <queue>

#include "khash.h"
#include "sequence_batch.h"

//#define LI_DEBUG

namespace chromap {
#define KHashFunctionForIndex(a) ((a)>>1)
#define KHashEqForIndex(a, b) ((a)>>1 == (b)>>1)
@@ -65,7 +68,7 @@ class Index {
  void GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates);
  void GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates);
  void GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates, std::vector<Candidate> *mate_candidates, int direction, unsigned int range);
  void CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits);
  int CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, bool use_heap);
  inline static uint64_t Hash64(uint64_t key, const uint64_t mask) {
    key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
    key = key ^ key >> 24;