Commit d0334577 authored by Li's avatar Li Committed by Haowen Zhang
Browse files

Do not cache the alignments near the beginning of a chromosome. Adjust the...

Do not cache the alignments near the beginning of a chromosome. Adjust the cache update frequency to be indepdent of thread number.
parent 0ccd2c7a
Loading
Loading
Loading
Loading
+7 −21
Original line number Diff line number Diff line
@@ -353,10 +353,7 @@ void Chromap::MapSingleEndReads() {
                    mapping_metadata);
              }
              
              if (read_index < num_loaded_reads &&
                  (read_index <
                       num_loaded_reads / mapping_parameters_.num_threads ||
                   num_reads_ <= 2500000)) {
              if (read_index < mm_to_candidates_cache.GetUpdateThreshold(num_loaded_reads, num_reads_, false)) {
                mm_history[read_index].timestamp = num_reads_;
                mm_history[read_index].minimizers =
                    mapping_metadata.minimizers_;
@@ -399,13 +396,9 @@ void Chromap::MapSingleEndReads() {
            }
          }
#pragma omp taskwait
          for (uint32_t read_index = 0; read_index < num_loaded_reads;
          for (uint32_t read_index = 0; 
              read_index < mm_to_candidates_cache.GetUpdateThreshold(num_loaded_reads, num_reads_, false);
               ++read_index) {
            if (num_reads_ > 2500000 &&
                read_index >=
                    num_loaded_reads / mapping_parameters_.num_threads) {
              break;
            }
            if (mm_history[read_index].timestamp != num_reads_) continue;
            mm_to_candidates_cache.Update(
                mm_history[read_index].minimizers,
@@ -770,10 +763,7 @@ void Chromap::MapPairedEndReads() {
                    paired_end_mapping_metadata.mapping_metadata2_
                        .GetNumCandidates();

                if (pair_index < num_loaded_pairs &&
                    (pair_index <
                         num_loaded_pairs / mapping_parameters_.num_threads ||
                     num_reads_ <= 5000000)) {
                if (pair_index < mm_to_candidates_cache.GetUpdateThreshold(num_loaded_pairs, num_reads_, true)) {
                  mm_history1[pair_index].timestamp =
                      mm_history2[pair_index].timestamp = num_reads_;
                  mm_history1[pair_index].minimizers =
@@ -931,13 +921,9 @@ void Chromap::MapPairedEndReads() {
          //}
#pragma omp taskwait
          // Update cache
          for (uint32_t pair_index = 0; pair_index < num_loaded_pairs;
          for (uint32_t pair_index = 0; 
              pair_index < mm_to_candidates_cache.GetUpdateThreshold(num_loaded_pairs, num_reads_, true);
               ++pair_index) {
            if (num_reads_ > 5000000 &&
                pair_index >=
                    num_loaded_pairs / mapping_parameters_.num_threads) {
              break;
            }
            if (mm_history1[pair_index].timestamp != num_reads_) continue;

            mm_to_candidates_cache.Update(
+33 −8
Original line number Diff line number Diff line
@@ -161,6 +161,7 @@ class mm_cache {
        neg_candidates[i].position =
            cache[hidx].positive_candidates[i].position - shift + read_len - 1;
      repetitive_seed_length = cache[hidx].repetitive_seed_length;

      return hidx;
    } else {
      return -1;
@@ -236,7 +237,25 @@ if (cache[hidx].finger_print_cnt_sum <= 5)
        cache[hidx].strands.resize(0);
        return;
      }

      int size = pos_candidates.size();
      int shift = (int)minimizers[0].second >> 1;
      // Do not cache if it is too near the start.
      for (i = 0; i < size; ++i) 
        if ((int)pos_candidates[i].position < 0) {
          cache[hidx].offsets.resize(0);
          cache[hidx].strands.resize(0);
          cache[hidx].minimizers.resize(0);
          return;
        }
      size = neg_candidates.size();
      for (i = 0; i < size; ++i)
        if ((int)neg_candidates[i].position - ((int)minimizers[msize - 1].second>>1)
            < kmer_length + shift) {
          cache[hidx].offsets.resize(0);
          cache[hidx].strands.resize(0);
          cache[hidx].minimizers.resize(0);
          return;
        }
      cache[hidx].offsets.resize(msize - 1);
      cache[hidx].strands.resize(msize);
      for (i = 0; i < msize; ++i) {
@@ -254,13 +273,9 @@ if (cache[hidx].finger_print_cnt_sum <= 5)
      cache[hidx].repetitive_seed_length = repetitive_seed_length;

      // adjust the candidate position.
      int size = cache[hidx].positive_candidates.size();
      int shift = (int)minimizers[0].second >> 1;
      for (i = 0; i < size; ++i) {
        uint64_t rid = (int)(cache[hidx].positive_candidates[i].position >> 32);
        int rpos = (int)cache[hidx].positive_candidates[i].position;
        cache[hidx].positive_candidates[i].position = (rid << 32) + (uint32_t)(rpos + shift);
      }
      size = cache[hidx].positive_candidates.size();
      for (i = 0; i < size; ++i) 
        cache[hidx].positive_candidates[i].position += shift;
      size = cache[hidx].negative_candidates.size();
      for (i = 0; i < size; ++i)
        cache[hidx].negative_candidates[i].position -= shift;
@@ -288,6 +303,16 @@ if (cache[hidx].finger_print_cnt_sum <= 5)
    return ret;
  }

  // How many reads from a batch we want to use to update the cache. 
  // paired end data has twice the amount reads, so the threshold is lower
  uint32_t GetUpdateThreshold(uint32_t num_loaded_reads, uint64_t num_reads, bool paired) {
    const uint32_t block = paired ? 2500000 : 5000000;
    if (num_reads <= block)
      return num_loaded_reads;
    else
      return num_loaded_reads / (8 * (num_reads / block));  
  }

  void PrintStats() {
    for (int i = 0; i < cache_size; ++i) {
      printf("%d %d %d %d ", cache[i].weight, cache[i].finger_print_cnt_sum,