Unverified Commit 860b7044 authored by Haowen Zhang's avatar Haowen Zhang Committed by GitHub
Browse files

Merge pull request #28 from haowenz/li_dev2

Reduce candidate cache overhead.
parents f1cf768b 0293ab53
Loading
Loading
Loading
Loading
+21 −13
Original line number Diff line number Diff line
@@ -1172,7 +1172,8 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
                //}
                //std::cerr << "After generation, #pc1: " << positive_candidates1.size() << ", #nc1: " << negative_candidates1.size() << ", #pc2: " << positive_candidates2.size() << ", #nc2: " << negative_candidates2.size() << "\n";
                uint32_t current_num_candidates2 = positive_candidates2.size() + negative_candidates2.size();
                if (pair_index < num_loaded_pairs / 2 && (pair_index < num_loaded_pairs / num_threads_ || num_reads_ < 2 * 5000000)) {
                if (pair_index < num_loaded_pairs && (pair_index < num_loaded_pairs / num_threads_ || num_reads_ <= 5000000)) {
									mm_history1[pair_index].timestamp = mm_history2[pair_index].timestamp = num_reads_;
                  mm_history1[pair_index].minimizers = minimizers1;
                  mm_history1[pair_index].positive_candidates = positive_candidates1;
                  mm_history1[pair_index].negative_candidates = negative_candidates1;
@@ -1327,22 +1328,26 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
          //  }
          //}
          // Update cache
          for (uint32_t pair_index = 0; pair_index < num_loaded_pairs / 2; ++pair_index) {
            if (num_reads_ >= 2 * 5000000 && pair_index >= num_loaded_pairs / num_threads_) {
          for (uint32_t pair_index = 0; pair_index < num_loaded_pairs; ++pair_index) {
            if (num_reads_ > 5000000 && pair_index >= num_loaded_pairs / num_threads_) {
              break;
            }
            if (mm_history1[pair_index].timestamp != num_reads_)
              continue ;

            mm_to_candidates_cache.Update(mm_history1[pair_index].minimizers, mm_history1[pair_index].positive_candidates, mm_history1[pair_index].negative_candidates, mm_history1[pair_index].repetitive_seed_length);
            mm_to_candidates_cache.Update(mm_history2[pair_index].minimizers, mm_history2[pair_index].positive_candidates, mm_history2[pair_index].negative_candidates, mm_history2[pair_index].repetitive_seed_length);
            if (mm_history1[pair_index].positive_candidates.size() < mm_history1[pair_index].positive_candidates.capacity() / 2) {

            if (mm_history1[pair_index].positive_candidates.size() > 50) {
              std::vector<Candidate>().swap(mm_history1[pair_index].positive_candidates);
            }
            if (mm_history1[pair_index].negative_candidates.size() < mm_history1[pair_index].negative_candidates.capacity() / 2) {
            if (mm_history1[pair_index].negative_candidates.size() > 50) {
              std::vector<Candidate>().swap(mm_history1[pair_index].negative_candidates);
            }
            if (mm_history2[pair_index].positive_candidates.size() < mm_history2[pair_index].positive_candidates.capacity() / 2) {
            if (mm_history2[pair_index].positive_candidates.size() > 50) {
              std::vector<Candidate>().swap(mm_history2[pair_index].positive_candidates);
            }
            if (mm_history2[pair_index].negative_candidates.size() < mm_history2[pair_index].negative_candidates.capacity() / 2) {
            if (mm_history2[pair_index].negative_candidates.size() > 50) {
              std::vector<Candidate>().swap(mm_history2[pair_index].negative_candidates);
            }
          }
@@ -2031,7 +2036,8 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
              if (mm_to_candidates_cache.Query(minimizers, positive_candidates, negative_candidates, repetitive_seed_length, read_batch.GetSequenceLengthAt(read_index)) == -1) {
                index.GenerateCandidates(error_threshold_, minimizers, &repetitive_seed_length, &positive_hits, &negative_hits, &positive_candidates, &negative_candidates);
              }
              if (read_index < num_loaded_reads / 2 && (read_index <  num_loaded_reads / num_threads_ || num_reads_ < 5000000)) {
              if (read_index < num_loaded_reads && (read_index <  num_loaded_reads / num_threads_ || num_reads_ <= 2500000)) {
                mm_history[read_index].timestamp = num_reads_;
                mm_history[read_index].minimizers = minimizers;
                mm_history[read_index].positive_candidates = positive_candidates;
                mm_history[read_index].negative_candidates = negative_candidates;
@@ -2059,10 +2065,12 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
              }
            }
          }
          for (uint32_t read_index = 0; read_index < num_loaded_reads / 2; ++read_index) {
            if (num_reads_ >= 5000000 && read_index >= num_loaded_reads / num_threads_) {
          for (uint32_t read_index = 0; read_index < num_loaded_reads ; ++read_index) {
            if (num_reads_ > 2500000 && read_index >= num_loaded_reads / num_threads_) {
              break;
            }
            if (mm_history[read_index].timestamp != num_reads_)
              continue;
            mm_to_candidates_cache.Update(mm_history[read_index].minimizers, mm_history[read_index].positive_candidates, mm_history[read_index].negative_candidates, mm_history[read_index].repetitive_seed_length);
            if (mm_history[read_index].positive_candidates.size() < mm_history[read_index].positive_candidates.capacity() / 2) {
              std::vector<Candidate>().swap(mm_history[read_index].positive_candidates);
+1 −1
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ struct StackCell {
};

struct _mm_history {
	bool skip;
	unsigned int timestamp;
	std::vector<std::pair<uint64_t, uint64_t> > minimizers;
	std::vector<Candidate> positive_candidates;
	std::vector<Candidate> negative_candidates;
+106 −136
Original line number Diff line number Diff line
@@ -5,113 +5,21 @@

#define FINGER_PRINT_SIZE 103

namespace chromap {

class mm_cache_candidate_list {
private:
  uint32_t *positions;
  uint8_t *counts; // 0: the corresponding position is chr id. Otherwise, it is coordinate within the chr id
  uint32_t size;
  uint32_t actual_size;

  void Clear() {
    if (positions != NULL)
    {
      delete[] positions;
      positions = NULL;
    }
    if (counts != NULL)
    {
      delete[] counts;
      counts = NULL;
    }
    size = actual_size = 0;
  }
public:
  mm_cache_candidate_list(){
    positions = NULL;
    counts = NULL;
    size = actual_size = 0;
  }

  ~mm_cache_candidate_list() {
    Clear();
  }
  
  void Input(const std::vector<Candidate> &candidates) {
    Clear();
    int i, k;
    actual_size = candidates.size();
    if (actual_size == 0)
      return;
    size = actual_size + 1;
    
    // Collect the extra size introduced by the chr.
    for (i = 1; i < (int)actual_size; ++i) {
      if ((candidates[i].position >> 32) != (candidates[i - 1].position >> 32))
        ++size;
    }
    positions = new uint32_t[size];
    counts = new uint8_t[size];

    k = 0;
    for (i = 0; i < (int)actual_size; ++i) {
      if (i == 0 || (candidates[i].position >> 32) != (candidates[i - 1].position >> 32)) {
        counts[k] = 0;
	positions[k] = (uint32_t)(candidates[i].position >> 32);
	++k;
      }
      counts[k] = candidates[i].count;
      if (counts[k] == 0) // may happen on overflow for the tandem repeats read
        counts[k] = 1;
      positions[k] = (uint32_t)candidates[i].position;
      ++k;
    }
  }

  void Output(std::vector<Candidate> &candidates, int shift) {
    candidates.resize(actual_size);
    int i, k;
    k = 0;
    uint64_t rid = 0;
    for (i = 0; i < (int)size; ++i) {
     if (counts[i] == 0) {
       rid = (uint64_t)positions[i] << 32ull ;
     } else {
       candidates[k].position = rid | (uint32_t)(positions[i] + shift);
       candidates[k].count = counts[i];
       ++k;
     }
    }
  }

  void Shift(int offset) {
    int i;
    for (i = 0; i < (int)size; ++i) {
      if (counts[i] != 0)
        positions[i] += offset;
    }
  }

  uint32_t GetActualSize() {
    return actual_size;
  }
} ;
#define HEAD_MM_ARRAY_SIZE 4194304 //2^22
#define HEAD_MM_ARRAY_MASK 0x3fffff // 22 positions

namespace chromap {
struct _mm_cache_entry {
  std::vector<uint64_t> minimizers;
  std::vector<int> offsets; // the distance to the next minimizer
  std::vector<uint8_t> strands;
  //std::vector<Candidate> positive_candidates;
  //std::vector<Candidate> negative_candidates;
 
  mm_cache_candidate_list positive_candidate_list ;
  mm_cache_candidate_list negative_candidate_list ;

  std::vector<Candidate> positive_candidates;
  std::vector<Candidate> negative_candidates;
  int weight;
  unsigned short finger_print_cnt[FINGER_PRINT_SIZE];
  int finger_print_cnt_sum;
  uint32_t repetitive_seed_length;
	int activated;
};

class mm_cache {
@@ -120,6 +28,8 @@ class mm_cache {
  struct _mm_cache_entry *cache;
  int kmer_length;
  int update_limit;
	int saturate_count;
  uint64_t *head_mm; // the first and last minimizer for each cached minimizer vector

  // 0: not match. -1: opposite order. 1: same order
  int IsMinimizersMatchCache(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const struct _mm_cache_entry &cache) {
@@ -164,17 +74,21 @@ class mm_cache {
 public:
  mm_cache(int size) {
    cache = new struct _mm_cache_entry[size];
		head_mm = new uint64_t[HEAD_MM_ARRAY_SIZE];
    cache_size = size;
    //memset(cache, 0, sizeof(cache[0]) * size) ;
    for (int i = 0; i < size; ++i) {
      cache[i].weight = 0;
      memset(cache[i].finger_print_cnt, 0, sizeof(unsigned short) * FINGER_PRINT_SIZE);
      cache[i].finger_print_cnt_sum = 0;
			cache[i].activated = 0;
    }
    update_limit = 10;
		saturate_count = 100;
  }
  ~mm_cache() {
    delete[] cache;
		delete[] head_mm;
  }

  void SetKmerLength(int kl) {
@@ -187,23 +101,43 @@ class mm_cache {
    int msize = minimizers.size();
		if (msize == 0)
			return -1;
		if ((head_mm[(minimizers[0].first>>6)&HEAD_MM_ARRAY_MASK] & (1ull<<(minimizers[0].first & 0x3f))) == 0)
			return -1;
    uint64_t h = 0;
    for (i = 0 ; i < msize; ++i)
      h += (minimizers[i].first);
    //for (i = 0 ; i < msize; ++i)
    //  h += (minimizers[i].first);
		if (msize == 1) {
			h = (minimizers[0].first) ;
		} else {
			h = minimizers[0].first + minimizers[msize - 1].first;
		}

    int hidx = h % cache_size;
    int direction = IsMinimizersMatchCache(minimizers, cache[hidx]);
    if (direction == 1) {
      int shift = (int)minimizers[0].second >> 1;
      cache[hidx].positive_candidate_list.Output(pos_candidates, -shift);
      cache[hidx].negative_candidate_list.Output(neg_candidates, shift);
      pos_candidates = cache[hidx].positive_candidates;
      neg_candidates = cache[hidx].negative_candidates;
      repetitive_seed_length = cache[hidx].repetitive_seed_length;
      int size = pos_candidates.size();
      int shift = (int)minimizers[0].second >> 1;
      for (i = 0; i < size; ++i)
        pos_candidates[i].position -= shift;
      size = neg_candidates.size();
      for (i = 0; i < size; ++i)
        neg_candidates[i].position += shift;
      return hidx;
    } else if (direction == -1) {// The "read" is on the other direction of the cached "read"
      int size = cache[hidx].negative_candidates.size();
      // Start position of the last minimizer shoud equal the first minimizer's end position in rc "read".
      int shift = read_len - ((int)minimizers[msize - 1].second>>1) - 1 + kmer_length - 1; 

      cache[hidx].negative_candidate_list.Output(pos_candidates, shift - read_len + 1);
      cache[hidx].positive_candidate_list.Output(neg_candidates, -shift + read_len - 1);
      pos_candidates = cache[hidx].negative_candidates;
      for (i = 0; i < size; ++i)
        pos_candidates[i].position = cache[hidx].negative_candidates[i].position + shift - read_len + 1;
      size = cache[hidx].positive_candidates.size();
      neg_candidates = cache[hidx].positive_candidates;
      for (i = 0; i < size; ++i)
        neg_candidates[i].position = cache[hidx].positive_candidates[i].position - shift + read_len - 1;
      repetitive_seed_length = cache[hidx].repetitive_seed_length;
      return hidx;
    } else {
@@ -214,30 +148,59 @@ class mm_cache {
  void Update(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<Candidate> &pos_candidates, std::vector<Candidate> &neg_candidates, uint32_t repetitive_seed_length) {
    int i;
    int msize = minimizers.size();
    if (msize == 0)
      return;

    uint64_t h = 0; // for hash
    uint64_t f = 0; // for finger printing
    for (i = 0; i < msize; ++i) {
    /*for (i = 0; i < msize; ++i) {
      h += (minimizers[i].first);	
      f ^= (minimizers[i].first);
    }*/
		if (msize == 0)
			return;
		else if (msize == 1) {
			h = f = (minimizers[0].first) ;
		} else {
			h = minimizers[0].first + minimizers[msize - 1].first;
			f = minimizers[0].first ^ minimizers[msize - 1].first;
		}
    int hidx = h % cache_size;
    int finger_print = f % FINGER_PRINT_SIZE; 

    ++cache[hidx].finger_print_cnt[finger_print];
    ++cache[hidx].finger_print_cnt_sum;
		if (cache[hidx].finger_print_cnt_sum > saturate_count)
			return;
		
		/*int rate = 2;
    if (cache[hidx].finger_print_cnt_sum <= 2)
			rate = 0;
		else if (cache[hidx].finger_print_cnt_sum < 5)
			rate = 2;
		else if (cache[hidx].finger_print_cnt_sum < 10)
			rate = 4;
		else
			rate = 5;*/

		/*int rate = 2;
    if (cache[hidx].finger_print_cnt_sum <= 5)
			rate = 0;
		else if (cache[hidx].finger_print_cnt_sum < 10)
			rate = 3;
		else
			rate = 5;*/

    if (cache[hidx].finger_print_cnt_sum < 10 || (int)cache[hidx].finger_print_cnt[finger_print] * 5 < cache[hidx].finger_print_cnt_sum) {
      return;
    }

    /*if ((int)cache[hidx].finger_print_cnt[finger_print] * rate < cache[hidx].finger_print_cnt_sum) {
			return ;
		}*/
    int direction = IsMinimizersMatchCache(minimizers, cache[hidx]);
    if (direction != 0)
      ++cache[hidx].weight;
    else
      --cache[hidx].weight;

		cache[hidx].activated = 1;
    // Renew the cache
    if (cache[hidx].weight < 0) {
      cache[hidx].weight = 1;
@@ -250,26 +213,31 @@ class mm_cache {

      cache[hidx].offsets.resize(msize - 1);
      cache[hidx].strands.resize(msize);
      for (i = 0; i < msize; ++i)
      {
      for (i = 0; i < msize; ++i) {
        cache[hidx].minimizers[i] = minimizers[i].first;
        cache[hidx].strands[i] = (minimizers[i].second & 1);
			}
      for (i = 0; i < msize - 1; ++i) {
        cache[hidx].offsets[i] = ((int)minimizers[i + 1].second>>1) - ((int)minimizers[i].second>>1);
      }
      /*std::vector<Candidate>().swap(cache[hidx].positive_candidates);
      std::vector<Candidate>().swap(cache[hidx].positive_candidates);
      std::vector<Candidate>().swap(cache[hidx].negative_candidates);
      cache[hidx].positive_candidates = pos_candidates;
      cache[hidx].negative_candidates = neg_candidates;*/
      cache[hidx].positive_candidate_list.Input(pos_candidates);
      cache[hidx].negative_candidate_list.Input(neg_candidates);
      cache[hidx].negative_candidates = neg_candidates;
      cache[hidx].repetitive_seed_length = repetitive_seed_length;

      // adjust the candidate position.
      int size = cache[hidx].positive_candidates.size();
      int shift = (int)minimizers[0].second>>1;
      cache[hidx].positive_candidate_list.Shift(shift);
      cache[hidx].negative_candidate_list.Shift(-shift);
      for (i = 0; i < size; ++i)
        cache[hidx].positive_candidates[i].position += shift;
      size = cache[hidx].negative_candidates.size();
      for (i = 0; i < size; ++i)
        cache[hidx].negative_candidates[i].position -= shift;
			
			// Update head mm array
			head_mm[(minimizers[0].first>>6) & HEAD_MM_ARRAY_MASK] |= (1ull<<(minimizers[0].first & 0x3f)) ;
			head_mm[(minimizers[msize - 1].first>>6) & HEAD_MM_ARRAY_MASK] |= (1ull<<(minimizers[msize - 1].first & 0x3f)) ;
    }
  }

@@ -278,14 +246,13 @@ class mm_cache {
  }

  uint64_t GetMemoryBytes() {
    return 0;
    int i;
    uint64_t ret = 0;
    for (i = 0; i < cache_size; ++i) {
      ret += sizeof(cache[i]) + cache[i].minimizers.capacity() * sizeof(uint64_t) ;
        //+ cache[i].offsets.capacity() * sizeof(int)
        //+ cache[i].positive_candidates.capacity() * sizeof(Candidate) 
        //+ cache[i].negative_candidates.capacity() * sizeof(Candidate);
      ret += sizeof(cache[i]) + cache[i].minimizers.capacity() * sizeof(uint64_t) 
        + cache[i].offsets.capacity() * sizeof(int)
        + cache[i].positive_candidates.capacity() * sizeof(Candidate) 
        + cache[i].negative_candidates.capacity() * sizeof(Candidate);
    }
    return ret;
  }
@@ -293,13 +260,16 @@ class mm_cache {
  void PrintStats() {
    for (int i = 0 ; i < cache_size ; ++i)
    {
      printf("%d %d %u ", cache[i].weight, cache[i].finger_print_cnt_sum, cache[i].positive_candidate_list.GetActualSize() + 
      			cache[i].negative_candidate_list.GetActualSize()) ;
      printf("%d %d %d %d ", cache[i].weight, cache[i].finger_print_cnt_sum, int(cache[i].positive_candidates.size() + 
      			cache[i].negative_candidates.size()), cache[i].activated) ;
      int tmp = 0;
      for (int j = 0 ; j < FINGER_PRINT_SIZE ; ++j)
        if (cache[i].finger_print_cnt[j] > tmp)
          tmp = cache[i].finger_print_cnt[j] ;
      printf("%d\n", tmp);
      printf("%d", tmp);
      for (int j = 0 ; j < FINGER_PRINT_SIZE ; ++j)
				printf(" %u", cache[i].finger_print_cnt[j]) ;
			printf("\n") ;
		}
  }
};