Commit 75be4a3d authored by Li
Browse files

Improve the efficiency of cache

parent 24d8bd02
Loading
Loading
Loading
Loading
+32 −15
Original line number Diff line number Diff line
@@ -17,6 +17,8 @@
namespace chromap {
struct _mm_history
{
	bool skip ;

	std::vector<std::pair<uint64_t, uint64_t> > minimizers ;
	std::vector<struct _candidate> positive_candidates ;
	std::vector<struct _candidate> negative_candidates ;
@@ -622,12 +624,15 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
        	index.GenerateCandidates(error_threshold_, minimizers2, &positive_hits2, &negative_hits2, &positive_candidates2, &negative_candidates2);
        uint32_t current_num_candidates2 = positive_candidates2.size() + negative_candidates2.size();
	  
	if (pair_index <  num_loaded_pairs / num_threads_ || num_reads_ < 2 * 5000000 )
	{
		mm_history1[pair_index].minimizers = minimizers1 ;
		mm_history1[pair_index].positive_candidates = positive_candidates1 ;
		mm_history1[pair_index].negative_candidates = negative_candidates1 ;
		mm_history2[pair_index].minimizers = minimizers2 ;
		mm_history2[pair_index].positive_candidates = positive_candidates2 ;
		mm_history2[pair_index].negative_candidates = negative_candidates2 ;
	}

        if (current_num_candidates1 > 0 && current_num_candidates2 > 0) {
          /*positive_candidates1.swap(positive_hits1);
@@ -679,6 +684,8 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
      }
    }
    for (uint32_t pair_index = 0; pair_index < num_loaded_pairs; ++pair_index) {
    	if ( num_reads_ >= 2 * 5000000 && pair_index >= num_loaded_pairs / num_threads_)
		break ;
    	mm_to_candidates_cache.Update(mm_history1[pair_index].minimizers, mm_history1[pair_index].positive_candidates,
				mm_history1[pair_index].negative_candidates) ;
    	mm_to_candidates_cache.Update(mm_history2[pair_index].minimizers, mm_history2[pair_index].positive_candidates,
@@ -1203,7 +1210,7 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
  SequenceBatch read_batch_for_loading(read_batch_size_);
  SequenceBatch barcode_batch(read_batch_size_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_);
  mm_cache mm_to_candidates_cache(1000003) ;
  mm_cache mm_to_candidates_cache(2000007) ;
  mm_to_candidates_cache.SetKmerLength(kmer_size_) ;
  struct _mm_history *mm_history = new struct _mm_history[read_batch_size_];
  read_batch_for_loading.InitializeLoading(read_file1_path_);
@@ -1286,8 +1293,13 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
        negative_candidates.clear();
	if ( mm_to_candidates_cache.Query(minimizers, positive_candidates, negative_candidates, 
		read_batch.GetSequenceLengthAt(read_index) ) == -1)
	{
		index.GenerateCandidates(error_threshold_, minimizers, &positive_hits, &negative_hits, 
					&positive_candidates, &negative_candidates);
		//printf("%d %d %d\n", minimizers.size(), positive_hits.size() + negative_hits.size(), 
		//	positive_candidates.size() + negative_candidates.size()) ;
		//if (positive_hits.size() + negative_hits.size() > minimizers.size() * 100)
	}
	/*else
	{
		printf("successful cache load.%s\n", read_batch.GetSequenceNameAt(read_index)) ;
@@ -1300,10 +1312,12 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
	{
		printf("LI_DEBUG -: %d\n", int(negative_candidates[i].refPos)) ;
	}*/
	if (read_index <  num_loaded_reads / num_threads_ || num_reads_ < 5000000 )
	{
		mm_history[read_index].minimizers = minimizers ;
		mm_history[read_index].positive_candidates = positive_candidates ;
		mm_history[read_index].negative_candidates = negative_candidates ;

	}
	uint32_t current_num_candidates = positive_candidates.size() + negative_candidates.size(); 
        //std::cerr << "Generated candidates!\n";
        if (current_num_candidates > 0) {
@@ -1329,6 +1343,8 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
      }
    }
    for (uint32_t read_index = 0; read_index < num_loaded_reads ; ++read_index) {
    	if ( num_reads_ >= 5000000 && read_index >= num_loaded_reads / num_threads_)
		break ;
    	mm_to_candidates_cache.Update(mm_history[read_index].minimizers, mm_history[read_index].positive_candidates,
				mm_history[read_index].negative_candidates) ;
	if (mm_history[read_index].positive_candidates.size() < mm_history[read_index].positive_candidates.capacity() / 2)
@@ -1336,6 +1352,7 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
	if (mm_history[read_index].negative_candidates.size() < mm_history[read_index].negative_candidates.capacity() / 2)
		std::vector<struct _candidate>().swap(mm_history[read_index].negative_candidates) ;
    }
    //std::cerr<<"cache memusage: " << mm_to_candidates_cache.GetMemoryBytes() <<"\n" ;
#pragma omp taskwait
    num_loaded_reads = num_loaded_reads_for_loading;
    read_batch_for_loading.SwapSequenceBatch(read_batch);
+42 −8
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@

#include "index.h"

#define FINGER_PRINT_SIZE 103

namespace chromap {

@@ -14,6 +15,9 @@ struct _mm_cache_entry
	std::vector<struct _candidate> negative_candidates ;
	
	int weight ;

	unsigned short finger_print_cnt[FINGER_PRINT_SIZE] ;
	int finger_print_cnt_sum ;
} ;

class mm_cache
@@ -22,6 +26,7 @@ private:
	int cache_size ;
	struct _mm_cache_entry *cache ;
	int kmer_length ;
	int update_limit ;
	
	// Return value: 0 = no match; -1 = match in opposite order; 1 = match in same order.
	int IsMinimizersMatchCache(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const struct _mm_cache_entry &cache)
@@ -77,6 +82,7 @@ public:
		cache = new struct _mm_cache_entry[size] ;
		cache_size = size ;
		memset(cache, 0, sizeof(cache[0]) * size) ;
		update_limit = 10 ;
	}
	~mm_cache()
	{
@@ -97,7 +103,7 @@ public:
		int msize = minimizers.size() ;
		uint64_t h = 0 ;
		for (i = 0 ; i < msize; ++i)
			h ^= (minimizers[i].first >> 8) ;	
			h += (minimizers[i].first) ;	
		int hidx = h % cache_size ;
		int direction = IsMinimizersMatchCache(minimizers, cache[hidx]) ;
		if (direction == 1)
@@ -141,16 +147,31 @@ public:
		int i ;
		int msize = minimizers.size() ;

		uint64_t h = 0 ;
		uint64_t h = 0 ; // for hash
		uint64_t f = 0 ; // for finger printing
		for (i = 0 ; i < msize; ++i)
			h ^= (minimizers[i].first >> 8) ;	
		{
			h += (minimizers[i].first) ;	
			f ^= (minimizers[i].first) ;
		}
		int hidx = h % cache_size ;
		int finger_print = f % FINGER_PRINT_SIZE ; 
		
		++cache[hidx].finger_print_cnt[finger_print] ;
		++cache[hidx].finger_print_cnt_sum ;

		if (cache[hidx].finger_print_cnt_sum < 10 
			|| (int)cache[hidx].finger_print_cnt[finger_print] * 5 < cache[hidx].finger_print_cnt_sum)
		{
			return ;
		}

		int direction = IsMinimizersMatchCache(minimizers, cache[hidx]) ;
		if (direction != 0)
			++cache[hidx].weight ;
		else
			--cache[hidx].weight ;

		// Renew the cache
		if (cache[hidx].weight < 0 ) 
		{
@@ -169,6 +190,8 @@ public:
			{
				cache[hidx].offsets[i] = ((int)minimizers[i + 1].second>>1) - ((int)minimizers[i].second>>1) ;
			}
			std::vector<struct _candidate>().swap(cache[hidx].positive_candidates) ;
			std::vector<struct _candidate>().swap(cache[hidx].negative_candidates) ;
			cache[hidx].positive_candidates = pos_candidates ;
			cache[hidx].negative_candidates = neg_candidates ;

@@ -184,9 +207,15 @@ public:
		}
	}
	
	int GetMemoryBytes()
	// Adds the signed delta `weight` to the weight stored in cache slot `idx`.
	// `idx` is used as-is to index `cache`; no bounds check is performed here.
	void DirectUpdateWeight(int idx, int weight)
	{
		cache[idx].weight += weight ;
	}

	uint64_t GetMemoryBytes()
	{
		int i ;
		uint64_t ret = 0 ;
		for (i = 0 ; i < cache_size ; ++i)
		{
			ret += sizeof(cache[i]) + cache[i].minimizers.capacity() * sizeof(uint64_t) 
@@ -196,8 +225,13 @@ public:
		}
		return ret ;
	}
} ;

	// Writes one line per cache slot to stdout, in slot order:
	// the slot's weight followed by its finger_print_cnt_sum.
	void PrintStats()
	{
		int slot = 0 ;
		while (slot < cache_size)
		{
			printf("%d %d\n", cache[slot].weight, cache[slot].finger_print_cnt_sum) ;
			++slot ;
		}
	}
} ;
} 

#endif