Commit 7c8ae817 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Merge remote-tracking branch 'origin/li_dev'

Conflicts:
	src/chromap.cc
parents 3aa9cc1c 2dd1b2db
Loading
Loading
Loading
Loading
+175 −44

File changed.

Preview size limit exceeded, changes collapsed.

+5 −9
Original line number Diff line number Diff line
@@ -48,10 +48,6 @@ struct BarcodeWithQual {
  }
};

enum Direction {
  kPositive,
  kNegative,
};

#define SortMappingWithoutBarcode(m) (((((m).fragment_start_position<<16)|(m).fragment_length)<<8)|(m).mapq)
//#define SortMappingWithoutBarcode(m) (m)
@@ -116,8 +112,8 @@ class Chromap {
  uint32_t LoadPairedEndReadsWithBarcodes(SequenceBatch *read_batch1, SequenceBatch *read_batch2, SequenceBatch *barcode_batch);
  void TrimAdapterForPairedEndRead(uint32_t pair_index, SequenceBatch *read_batch1, SequenceBatch *read_batch2);
  bool PairedEndReadWithBarcodeIsDuplicate(uint32_t pair_index, const SequenceBatch &barcode_batch, const SequenceBatch &read_batch1, const SequenceBatch &read_batch2);
  void ReduceCandidatesForPairedEndReadOnOneDirection(const std::vector<uint64_t> &candidates1, const std::vector<uint64_t> &candidates2, std::vector<uint64_t> *filtered_candidates1, std::vector<uint64_t> *filtered_candidates2);
  void ReduceCandidatesForPairedEndRead(const std::vector<uint64_t> &positive_candidates1, const std::vector<uint64_t> &negative_candidates1, const std::vector<uint64_t> &positive_candidates2, const std::vector<uint64_t> &negative_candidates2, std::vector<uint64_t> *filtered_positive_candidates1, std::vector<uint64_t> *filtered_negative_candidates1, std::vector<uint64_t> *filtered_positive_candidates2, std::vector<uint64_t> *filtered_negative_candidates2);
  void ReduceCandidatesForPairedEndReadOnOneDirection(const std::vector<struct _candidate> &candidates1, const std::vector<struct _candidate> &candidates2, std::vector<struct _candidate> *filtered_candidates1, std::vector<struct _candidate> *filtered_candidates2);
  void ReduceCandidatesForPairedEndRead(const std::vector<struct _candidate> &positive_candidates1, const std::vector<struct _candidate> &negative_candidates1, const std::vector<struct _candidate> &positive_candidates2, const std::vector<struct _candidate> &negative_candidates2, std::vector<struct _candidate> *filtered_positive_candidates1, std::vector<struct _candidate> *filtered_negative_candidates1, std::vector<struct _candidate> *filtered_positive_candidates2, std::vector<struct _candidate> *filtered_negative_candidates2);
  void GenerateBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, uint32_t pair_index, int num_candidates1, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, int num_candidates2, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const std::vector<std::pair<int, uint64_t> > &mappings2, std::vector<std::pair<uint32_t, uint32_t> > *best_mappings, int *min_sum_errors, int *num_best_mappings, int *second_min_sum_errors, int *num_second_best_mappings);
  void RecalibrateBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, uint32_t pair_index, int min_sum_errors, int second_min_sum_errors, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const std::vector<std::pair<int, uint64_t> > &mappings2, const std::vector<std::pair<uint32_t, uint32_t> > &edit_best_mappings, std::vector<std::pair<uint32_t, uint32_t> > *best_mappings, int *best_alignment_score, int *num_best_mappings, int *second_best_alignment_score, int *num_second_best_mappings);
  void ProcessBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, uint32_t pair_index, uint8_t mapq, int num_candidates1, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, int num_candidates2, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings2, const std::vector<std::pair<uint32_t, uint32_t> > &best_mappings, int min_sum_errors, int num_best_mappings, int second_min_sum_errors, int num_second_best_mappings, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
@@ -141,9 +137,9 @@ class Chromap {
  void BandedAlign4PatternsToText(const char **patterns, const char *text, int read_length, int32_t *mapping_edit_distances, int32_t *mapping_end_positions);
  void BandedAlign8PatternsToText(const char **patterns, const char *text, int read_length, int16_t *mapping_edit_distances, int16_t *mapping_end_positions);
  void BandedTraceback(int min_num_errors, const char *pattern, const char *text, const int read_length, int *mapping_start_position);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<uint64_t> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<uint64_t> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<uint64_t> &positive_candidates, const std::vector<uint64_t> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<std::pair<int, uint64_t> > *negative_mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<struct _candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<struct _candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<struct _candidate> &positive_candidates, const std::vector<struct _candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<std::pair<int, uint64_t> > *negative_mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void AllocateMultiMappings(uint32_t num_reference_sequences);
  void RemovePCRDuplicate(uint32_t num_reference_sequences);
  uint32_t MoveMappingsInBuffersToMappingContainer(uint32_t num_reference_sequences, std::vector<std::vector<std::vector<MappingRecord> > > *mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
+19 −4
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#include "chromap.h"

namespace chromap {

void Index::Statistics(uint32_t num_sequences, const SequenceBatch &reference) {
  double real_start_time = Chromap<>::GetRealTime();
  int n = 0, n1 = 0;
@@ -261,7 +262,7 @@ void Index::Load() {
  std::cerr << "Loaded index successfully in "<< Chromap<>::GetRealTime() - real_start_time << "s.\n";
}

void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<uint64_t> *candidates) {
void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<struct _candidate> *candidates) {
  hits->emplace_back(UINT64_MAX);
  if (hits->size() > 0) {
    std::sort(hits->begin(), hits->end());
@@ -274,7 +275,10 @@ void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<ui
      uint32_t current_reference_position = (*hits)[pi];
      if (current_reference_id != previous_reference_id || current_reference_position > previous_reference_position + error_threshold) {
        if (count >= min_num_seeds_required_for_mapping_) {
          candidates->push_back(previous_hit);
	  struct _candidate nc ;
	  nc.refPos = previous_hit ;
	  nc.mmCnt = count ;
          candidates->push_back(nc);
	}
        count = 1;
      } else {
@@ -338,16 +342,19 @@ void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pai
  }
}

void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<uint64_t> *positive_candidates, std::vector<uint64_t> *negative_candidates) {
void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<struct _candidate> *positive_candidates, std::vector<struct _candidate> *negative_candidates) {
  CollectCandidates(max_seed_frequencies_[0], minimizers, positive_hits, negative_hits);
  // Now I can generate primer chain in candidates
  // Let me use sort for now, but I can use merge later.
  //printf("p+n: %d\n", positive_hits->size() + negative_hits->size()) ;
  GenerateCandidatesOnOneDirection(error_threshold, positive_hits, positive_candidates);
  GenerateCandidatesOnOneDirection(error_threshold, negative_hits, negative_candidates);
  if (positive_candidates->size() + negative_candidates->size() == 0) {
    positive_hits->clear();
    negative_hits->clear();
    //printf("second round\n") ;
    CollectCandidates(max_seed_frequencies_[1], minimizers, positive_hits, negative_hits);
    //printf("p+n2: %d\n", positive_hits->size() + negative_hits->size()) ;
    GenerateCandidatesOnOneDirection(error_threshold, positive_hits, positive_candidates);
    GenerateCandidatesOnOneDirection(error_threshold, negative_hits, negative_candidates);
    // TODO: if necessary, we can further improve the rescue. But the code below is not thread safe. We can think about this later
@@ -358,5 +365,13 @@ void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<
//      GenerateCandidatesOnOneDirection(negative_hits, negative_candidates);
//    }
  }
  uint32_t i ;
  uint32_t size = positive_candidates->size() ;
  for (i = 0 ; i < size ; ++i)
  	(*positive_candidates)[i].direction = kPositive ;
  size = negative_candidates->size() ;
  for (i = 0 ; i < size ; ++i)
  	(*negative_candidates)[i].direction = kNegative ;
  //printf("p+n_candidates: %d\n", positive_candidates->size() + negative_candidates->size()) ;
}
} // namespace chromap
+22 −2
Original line number Diff line number Diff line
@@ -12,6 +12,26 @@ namespace chromap {
#define KHashEqForIndex(a, b) ((a)>>1 == (b)>>1)
KHASH_INIT(k64, uint64_t, uint64_t, 1, KHashFunctionForIndex, KHashEqForIndex);

enum Direction {
  kPositive,
  kNegative,
};

struct _candidate
{
	uint64_t refPos ; // position on the ref genome.
	uint32_t mmCnt ; // The number of minimizers in this candidate
	Direction direction ; 

	bool operator<(const struct _candidate &c) const
	{
		if (mmCnt != c.mmCnt)		
			return mmCnt > c.mmCnt ;
		else
			return refPos < c.refPos ;
	}
} ;

class Index {
 public:
  Index(int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, const std::string &index_file_path) : min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), index_file_path_(index_file_path) { // for read mapping
@@ -51,8 +71,8 @@ class Index {
  void Construct(uint32_t num_sequences, const SequenceBatch &reference);
  void Save();
  void Load();
  void GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<uint64_t> *candidates);
  void GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<uint64_t> *positive_candidates, std::vector<uint64_t> *negative_candidates);
  void GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<struct _candidate> *candidates);
  void GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<struct _candidate> *positive_candidates, std::vector<struct _candidate> *negative_candidates);
  void CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits);
  inline static uint64_t Hash64(uint64_t key, const uint64_t mask) {
    key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;

src/mmcache.hpp

0 → 100644
+203 −0
Original line number Diff line number Diff line
#ifndef CHROMAP_CACHE_H_
#define CHROMAP_CACHE_H_

#include "index.h"


namespace chromap {

struct _mm_cache_entry
{
	std::vector<uint64_t> minimizers ;
	std::vector<int> offsets ; // the distance to the next minimizer
	std::vector<struct _candidate> positive_candidates ;
	std::vector<struct _candidate> negative_candidates ;

	int weight ;
} ;

class mm_cache
{
private:
	int cache_size ;
	struct _mm_cache_entry *cache ;
	int kmer_length ;
	
	// 0: not match. -1: opposite order. 1: same order
	int IsMinimizersMatchCache(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const struct _mm_cache_entry &cache)
	{
		if (cache.minimizers.size() != minimizers.size())
			return 0 ;
		int size = minimizers.size() ;
		int i, j ;
		int direction = 0 ;
		for (i = 0 ; i < size ; ++i)
		{
			if (cache.minimizers[i] != minimizers[i].first)
				break ;
		}
		if (i >= size)
		{
			for (i = 0 ; i < size - 1 ; ++i)	
			{
				if (cache.offsets[i] != ((int)minimizers[i + 1].second>>1) - ((int)minimizers[i].second>>1))
					break ;
			}
			if (i >= size - 1)
				direction = 1 ;
		}
		
		if (direction == 1)
			return 1 ;
		
		for (i = 0, j = size - 1; i < size; ++i, --j)
		{
			if (cache.minimizers[i] != minimizers[j].first)
				break ;
		}
		if (i >= size)
		{
			for (i = 0, j = size - 1; i < size - 1; ++i, --j)
			{
				if (cache.offsets[i] != ((int)minimizers[j].second>>1) - ((int)minimizers[j - 1].second>>1))
					break ;
			}

			if (i >= size - 1)
			{
				direction = -1 ;
			}
		}

		return direction ;
	}
public:
	mm_cache(int size) 
	{
		cache = new struct _mm_cache_entry[size] ;
		cache_size = size ;
		memset(cache, 0, sizeof(cache[0]) * size) ;
	}
	~mm_cache()
	{
		delete[] cache ;
	}
	
	void SetKmerLength(int kl)
	{
		kmer_length = kl ;
	}

	// Return the hash entry index. -1 if failed.
	int Query(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, 
			std::vector<struct _candidate> &pos_candidates, std::vector<struct _candidate> &neg_candidates, 
			uint32_t read_len)
	{
		int i ;
		int msize = minimizers.size() ;
		uint64_t h = 0 ;
		for (i = 0 ; i < msize; ++i)
			h ^= (minimizers[i].first >> 8) ;	
		int hidx = h % cache_size ;
		int direction = IsMinimizersMatchCache(minimizers, cache[hidx]) ;
		if (direction == 1)
		{
			pos_candidates = cache[hidx].positive_candidates ;
			neg_candidates = cache[hidx].negative_candidates ;
			int size = pos_candidates.size() ;
			int shift = (int)minimizers[0].second>>1 ;
			for (i = 0 ; i < size ; ++i)
				pos_candidates[i].refPos -= shift ;
			size = neg_candidates.size() ;
			for (i = 0 ; i < size ; ++i)
				neg_candidates[i].refPos += shift ;
			return hidx ;
		}
		else if (direction == -1) // The "read" is on the other direction of the cached "read"
		{
			int size = cache[hidx].negative_candidates.size() ;
			// Start position of the last minimizer shoud equal the first minimizer's end position in rc "read".
			int shift = read_len - ((int)minimizers[msize - 1].second>>1) - 1 + kmer_length - 1 ; 
			
			pos_candidates = cache[hidx].negative_candidates ;
			for (i = 0 ; i < size ; ++i)
				pos_candidates[i].refPos = cache[hidx].negative_candidates[i].refPos + shift - read_len + 1 ;

			size = cache[hidx].positive_candidates.size() ;
			neg_candidates = cache[hidx].positive_candidates ;
			for (i = 0 ; i < size ; ++i)
				neg_candidates[i].refPos = cache[hidx].positive_candidates[i].refPos - shift + read_len - 1 ;
			return hidx ;
		}
		else
		{
			return -1 ;
		}
	}

	void Update(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, 
			std::vector<struct _candidate> &pos_candidates, std::vector<struct _candidate> &neg_candidates )
	{
		int i ;
		int msize = minimizers.size() ;

		uint64_t h = 0 ;
		for (i = 0 ; i < msize; ++i)
			h ^= (minimizers[i].first >> 8) ;	
		int hidx = h % cache_size ;

		int direction = IsMinimizersMatchCache(minimizers, cache[hidx]) ;
		if (direction != 0)
			++cache[hidx].weight ;
		else
			--cache[hidx].weight ;
		// Renew the cache
		if (cache[hidx].weight < 0)
		{
			cache[hidx].weight = 1 ;
			cache[hidx].minimizers.resize(msize) ;
			if (msize == 0)
			{
				cache[hidx].offsets.resize(0) ;
				return ;
			}

			cache[hidx].offsets.resize(msize - 1) ;
			for (i = 0 ; i < msize ; ++i)
				cache[hidx].minimizers[i] = minimizers[i].first ;
			for (i = 0 ; i < msize - 1; ++i)
			{
				cache[hidx].offsets[i] = ((int)minimizers[i + 1].second>>1) - ((int)minimizers[i].second>>1) ;
			}
			cache[hidx].positive_candidates = pos_candidates ;
			cache[hidx].negative_candidates = neg_candidates ;

			// adjust the candidate position.
			int size = cache[hidx].positive_candidates.size() ;
			int shift = (int)minimizers[0].second>>1;
			for (i = 0 ; i < size ; ++i)
				cache[hidx].positive_candidates[i].refPos += shift ;
			size = cache[hidx].negative_candidates.size() ;
			for (i = 0 ; i < size ; ++i)
				cache[hidx].negative_candidates[i].refPos -= shift ;
			
		}
	}
	
	int GetMemoryBytes()
	{
		int i, ret = 0 ;
		for (i = 0 ; i < cache_size ; ++i)
		{
			ret += sizeof(cache[i]) + cache[i].minimizers.capacity() * sizeof(uint64_t) 
				+ cache[i].offsets.capacity() * sizeof(int)
				+ cache[i].positive_candidates.capacity() * sizeof(struct _candidate) 
				+ cache[i].negative_candidates.capacity() * sizeof(struct _candidate) ;
		}
		return ret ;
	}
} ;

} 

#endif