Commit d5b33c68 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Refactor code from Li

parent edf24f80
Loading
Loading
Loading
Loading
+150 −155

File changed.

Preview size limit exceeded, changes collapsed.

+9 −5
Original line number Diff line number Diff line
@@ -48,6 +48,10 @@ struct BarcodeWithQual {
  }
};

enum Direction {
  kPositive,
  kNegative,
};

#define SortMappingWithoutBarcode(m) (((((m).fragment_start_position<<16)|(m).fragment_length)<<8)|(m).mapq)
//#define SortMappingWithoutBarcode(m) (m)
@@ -112,8 +116,8 @@ class Chromap {
  uint32_t LoadPairedEndReadsWithBarcodes(SequenceBatch *read_batch1, SequenceBatch *read_batch2, SequenceBatch *barcode_batch);
  void TrimAdapterForPairedEndRead(uint32_t pair_index, SequenceBatch *read_batch1, SequenceBatch *read_batch2);
  bool PairedEndReadWithBarcodeIsDuplicate(uint32_t pair_index, const SequenceBatch &barcode_batch, const SequenceBatch &read_batch1, const SequenceBatch &read_batch2);
  void ReduceCandidatesForPairedEndReadOnOneDirection(const std::vector<struct _candidate> &candidates1, const std::vector<struct _candidate> &candidates2, std::vector<struct _candidate> *filtered_candidates1, std::vector<struct _candidate> *filtered_candidates2);
  void ReduceCandidatesForPairedEndRead(const std::vector<struct _candidate> &positive_candidates1, const std::vector<struct _candidate> &negative_candidates1, const std::vector<struct _candidate> &positive_candidates2, const std::vector<struct _candidate> &negative_candidates2, std::vector<struct _candidate> *filtered_positive_candidates1, std::vector<struct _candidate> *filtered_negative_candidates1, std::vector<struct _candidate> *filtered_positive_candidates2, std::vector<struct _candidate> *filtered_negative_candidates2);
  void ReduceCandidatesForPairedEndReadOnOneDirection(const std::vector<Candidate> &candidates1, const std::vector<Candidate> &candidates2, std::vector<Candidate> *filtered_candidates1, std::vector<Candidate> *filtered_candidates2);
  void ReduceCandidatesForPairedEndRead(const std::vector<Candidate> &positive_candidates1, const std::vector<Candidate> &negative_candidates1, const std::vector<Candidate> &positive_candidates2, const std::vector<Candidate> &negative_candidates2, std::vector<Candidate> *filtered_positive_candidates1, std::vector<Candidate> *filtered_negative_candidates1, std::vector<Candidate> *filtered_positive_candidates2, std::vector<Candidate> *filtered_negative_candidates2);
  void GenerateBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, uint32_t pair_index, int num_candidates1, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, int num_candidates2, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const std::vector<std::pair<int, uint64_t> > &mappings2, std::vector<std::pair<uint32_t, uint32_t> > *best_mappings, int *min_sum_errors, int *num_best_mappings, int *second_min_sum_errors, int *num_second_best_mappings);
  void RecalibrateBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, uint32_t pair_index, int min_sum_errors, int second_min_sum_errors, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const std::vector<std::pair<int, uint64_t> > &mappings2, const std::vector<std::pair<uint32_t, uint32_t> > &edit_best_mappings, std::vector<std::pair<uint32_t, uint32_t> > *best_mappings, int *best_alignment_score, int *num_best_mappings, int *second_best_alignment_score, int *num_second_best_mappings);
  void ProcessBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, uint32_t pair_index, uint8_t mapq, int num_candidates1, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, int num_candidates2, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings2, const std::vector<std::pair<uint32_t, uint32_t> > &best_mappings, int min_sum_errors, int num_best_mappings, int second_min_sum_errors, int num_second_best_mappings, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
@@ -137,9 +141,9 @@ class Chromap {
  void BandedAlign4PatternsToText(const char **patterns, const char *text, int read_length, int32_t *mapping_edit_distances, int32_t *mapping_end_positions);
  void BandedAlign8PatternsToText(const char **patterns, const char *text, int read_length, int16_t *mapping_edit_distances, int16_t *mapping_end_positions);
  void BandedTraceback(int min_num_errors, const char *pattern, const char *text, const int read_length, int *mapping_start_position);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<struct _candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<struct _candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<struct _candidate> &positive_candidates, const std::vector<struct _candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<std::pair<int, uint64_t> > *negative_mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<Candidate> &positive_candidates, const std::vector<Candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<std::pair<int, uint64_t> > *negative_mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void AllocateMultiMappings(uint32_t num_reference_sequences);
  void RemovePCRDuplicate(uint32_t num_reference_sequences);
  uint32_t MoveMappingsInBuffersToMappingContainer(uint32_t num_reference_sequences, std::vector<std::vector<std::vector<MappingRecord> > > *mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
+13 −13
Original line number Diff line number Diff line
@@ -262,7 +262,7 @@ void Index::Load() {
  std::cerr << "Loaded index successfully in "<< Chromap<>::GetRealTime() - real_start_time << "s.\n";
}

void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<struct _candidate> *candidates) {
void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates) {
  hits->emplace_back(UINT64_MAX);
  if (hits->size() > 0) {
    std::sort(hits->begin(), hits->end());
@@ -275,9 +275,9 @@ void Index::GenerateCandidatesOnOneDirection(int error_threshold, std::vector<ui
      uint32_t current_reference_position = (*hits)[pi];
      if (current_reference_id != previous_reference_id || current_reference_position > previous_reference_position + error_threshold) {
        if (count >= min_num_seeds_required_for_mapping_) {
	  struct _candidate nc ;
	  nc.refPos = previous_hit ;
	  nc.mmCnt = count ;
          Candidate nc;
          nc.position = previous_hit;
          nc.count = count;
          candidates->push_back(nc);
        }
        count = 1;
@@ -342,7 +342,7 @@ void Index::CollectCandidates(int max_seed_frequency, const std::vector<std::pai
  }
}

void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<struct _candidate> *positive_candidates, std::vector<struct _candidate> *negative_candidates) {
void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates) {
  CollectCandidates(max_seed_frequencies_[0], minimizers, positive_hits, negative_hits);
  // Now I can generate primer chain in candidates
  // Let me use sort for now, but I can use merge later.
@@ -365,13 +365,13 @@ void Index::GenerateCandidates(int error_threshold, const std::vector<std::pair<
//      GenerateCandidatesOnOneDirection(negative_hits, negative_candidates);
//    }
  }
  uint32_t i ;
  uint32_t size = positive_candidates->size() ;
  for (i = 0 ; i < size ; ++i)
  	(*positive_candidates)[i].direction = kPositive ;
  size = negative_candidates->size() ;
  for (i = 0 ; i < size ; ++i)
  	(*negative_candidates)[i].direction = kNegative ;
  //uint32_t i ;
  //uint32_t size = positive_candidates->size() ;
  //for (i = 0 ; i < size ; ++i)
  //	(*positive_candidates)[i].direction = kPositive ;
  //size = negative_candidates->size() ;
  //for (i = 0 ; i < size ; ++i)
  //	(*negative_candidates)[i].direction = kNegative ;
  //printf("p+n_candidates: %d\n", positive_candidates->size() + negative_candidates->size()) ;
}
} // namespace chromap
+11 −20
Original line number Diff line number Diff line
@@ -12,23 +12,14 @@ namespace chromap {
#define KHashEqForIndex(a, b) ((a)>>1 == (b)>>1)
KHASH_INIT(k64, uint64_t, uint64_t, 1, KHashFunctionForIndex, KHashEqForIndex);

enum Direction {
  kPositive,
  kNegative,
};

struct _candidate
{
	uint64_t refPos ; // position on the ref genome.
	uint32_t mmCnt ; // The number of minimizers in this candidate
	Direction direction ; 

	bool operator<(const struct _candidate &c) const
	{
		if (mmCnt != c.mmCnt)		
			return mmCnt > c.mmCnt ;
struct Candidate {
  uint64_t position;
  uint8_t count;
  bool operator<(const Candidate &c) const {
    if (count != c.count)		
      return count > c.count;
    else
			return refPos < c.refPos ;
      return position < c.position;
  }
};

@@ -71,8 +62,8 @@ class Index {
  void Construct(uint32_t num_sequences, const SequenceBatch &reference);
  void Save();
  void Load();
  void GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<struct _candidate> *candidates);
  void GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<struct _candidate> *positive_candidates, std::vector<struct _candidate> *negative_candidates);
  void GenerateCandidatesOnOneDirection(int error_threshold, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates);
  void GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates);
  void CollectCandidates(int max_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits);
  inline static uint64_t Hash64(uint64_t key, const uint64_t mask) {
    key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
+196 −227
Original line number Diff line number Diff line
@@ -6,22 +6,17 @@
#define FINGER_PRINT_SIZE 103

namespace chromap {

struct _mm_cache_entry
{
struct _mm_cache_entry {
  std::vector<uint64_t> minimizers;
  std::vector<int> offsets; // the distance to the next minimizer
	std::vector<struct _candidate> positive_candidates ;
	std::vector<struct _candidate> negative_candidates ;
	
  std::vector<Candidate> positive_candidates;
  std::vector<Candidate> negative_candidates;
  int weight;

  unsigned short finger_print_cnt[FINGER_PRINT_SIZE];
  int finger_print_cnt_sum;
};

class mm_cache
{
class mm_cache {
 private:
  int cache_size;
  struct _mm_cache_entry *cache;
@@ -29,22 +24,18 @@ private:
  int update_limit;

  // 0: not match. -1: opposite order. 1: same order
	int IsMinimizersMatchCache(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const struct _mm_cache_entry &cache)
	{
  int IsMinimizersMatchCache(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const struct _mm_cache_entry &cache) {
    if (cache.minimizers.size() != minimizers.size())
      return 0;
    int size = minimizers.size();
    int i, j;
    int direction = 0;
		for (i = 0 ; i < size ; ++i)
		{
    for (i = 0; i < size; ++i) {
      if (cache.minimizers[i] != minimizers[i].first)
        break;
    }
		if (i >= size)
		{
			for (i = 0 ; i < size - 1 ; ++i)	
			{
    if (i >= size) {
      for (i = 0; i < size - 1; ++i)	{
        if (cache.offsets[i] != ((int)minimizers[i + 1].second>>1) - ((int)minimizers[i].second>>1))
          break;
      }
@@ -55,50 +46,45 @@ private:
    if (direction == 1)
      return 1;

		for (i = 0, j = size - 1; i < size; ++i, --j)
		{
    for (i = 0, j = size - 1; i < size; ++i, --j) {
      if (cache.minimizers[i] != minimizers[j].first)
        break;
    }
		if (i >= size)
		{
			for (i = 0, j = size - 1; i < size - 1; ++i, --j)
			{
    if (i >= size) {
      for (i = 0, j = size - 1; i < size - 1; ++i, --j) {
        if (cache.offsets[i] != ((int)minimizers[j].second>>1) - ((int)minimizers[j - 1].second>>1))
          break;
      }

			if (i >= size - 1)
			{
      if (i >= size - 1) {
        direction = -1;
      }
    }

    return direction;
  }

 public:
	mm_cache(int size) 
	{
  mm_cache(int size) {
    cache = new struct _mm_cache_entry[size];
    cache_size = size;
		memset(cache, 0, sizeof(cache[0]) * size) ;
    //memset(cache, 0, sizeof(cache[0]) * size) ;
    for (int i = 0; i < size; ++i) {
      cache[i].weight = 0;
      memset(cache[i].finger_print_cnt, 0, sizeof(unsigned short) * FINGER_PRINT_SIZE);
      cache[i].finger_print_cnt_sum = 0;
    }
    update_limit = 10;
  }
	~mm_cache()
	{
  ~mm_cache() {
    delete[] cache;
  }

	void SetKmerLength(int kl)
	{
  void SetKmerLength(int kl) {
    kmer_length = kl;
  }

  // Return the hash entry index. -1 if failed.
	int Query(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, 
			std::vector<struct _candidate> &pos_candidates, std::vector<struct _candidate> &neg_candidates, 
			uint32_t read_len)
	{
  int Query(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<Candidate> &pos_candidates, std::vector<Candidate> &neg_candidates, uint32_t read_len) {
    int i;
    int msize = minimizers.size();
    uint64_t h = 0;
@@ -106,51 +92,44 @@ public:
      h += (minimizers[i].first);
    int hidx = h % cache_size;
    int direction = IsMinimizersMatchCache(minimizers, cache[hidx]);
		if (direction == 1)
		{
    if (direction == 1) {
      pos_candidates = cache[hidx].positive_candidates;
      neg_candidates = cache[hidx].negative_candidates;
      int size = pos_candidates.size();
      int shift = (int)minimizers[0].second >> 1;
      for (i = 0; i < size; ++i)
				pos_candidates[i].refPos -= shift ;
        pos_candidates[i].position -= shift;
      size = neg_candidates.size();
      for (i = 0; i < size; ++i)
				neg_candidates[i].refPos += shift ;
        neg_candidates[i].position += shift;
      return hidx;
    }
		else if (direction == -1) // The "read" is on the other direction of the cached "read"
		{
    else if (direction == -1) {// The "read" is on the other direction of the cached "read"
      int size = cache[hidx].negative_candidates.size();
      // Start position of the last minimizer shoud equal the first minimizer's end position in rc "read".
      int shift = read_len - ((int)minimizers[msize - 1].second>>1) - 1 + kmer_length - 1; 

      pos_candidates = cache[hidx].negative_candidates;
      for (i = 0; i < size; ++i)
				pos_candidates[i].refPos = cache[hidx].negative_candidates[i].refPos + shift - read_len + 1 ;
        pos_candidates[i].position = cache[hidx].negative_candidates[i].position + shift - read_len + 1;

      size = cache[hidx].positive_candidates.size();
      neg_candidates = cache[hidx].positive_candidates;
      for (i = 0; i < size; ++i)
				neg_candidates[i].refPos = cache[hidx].positive_candidates[i].refPos - shift + read_len - 1 ;
        neg_candidates[i].position = cache[hidx].positive_candidates[i].position - shift + read_len - 1;
      return hidx;
		}
		else
		{
    } else {
      return -1;
    }
  }

	void Update(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, 
			std::vector<struct _candidate> &pos_candidates, std::vector<struct _candidate> &neg_candidates )
	{
  void Update(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<Candidate> &pos_candidates, std::vector<Candidate> &neg_candidates) {
    int i;
    int msize = minimizers.size();

    uint64_t h = 0; // for hash
    uint64_t f = 0; // for finger printing
		for (i = 0 ; i < msize; ++i)
		{
    for (i = 0; i < msize; ++i) {
      h += (minimizers[i].first);	
      f ^= (minimizers[i].first);
    }
@@ -160,9 +139,7 @@ public:
    ++cache[hidx].finger_print_cnt[finger_print];
    ++cache[hidx].finger_print_cnt_sum;

		if (cache[hidx].finger_print_cnt_sum < 10 
			|| (int)cache[hidx].finger_print_cnt[finger_print] * 5 < cache[hidx].finger_print_cnt_sum)
		{
    if (cache[hidx].finger_print_cnt_sum < 10 || (int)cache[hidx].finger_print_cnt[finger_print] * 5 < cache[hidx].finger_print_cnt_sum) {
      return;
    }

@@ -173,12 +150,10 @@ public:
      --cache[hidx].weight;

    // Renew the cache
		if (cache[hidx].weight < 0 ) 
		{
    if (cache[hidx].weight < 0) {
      cache[hidx].weight = 1;
      cache[hidx].minimizers.resize(msize);
			if (msize == 0)
			{
      if (msize == 0) {
        cache[hidx].offsets.resize(0);
        return;
      }
@@ -186,12 +161,11 @@ public:
      cache[hidx].offsets.resize(msize - 1);
      for (i = 0; i < msize; ++i)
        cache[hidx].minimizers[i] = minimizers[i].first;
			for (i = 0 ; i < msize - 1; ++i)
			{
      for (i = 0; i < msize - 1; ++i) {
        cache[hidx].offsets[i] = ((int)minimizers[i + 1].second>>1) - ((int)minimizers[i].second>>1);
      }
			std::vector<struct _candidate>().swap(cache[hidx].positive_candidates) ;
			std::vector<struct _candidate>().swap(cache[hidx].negative_candidates) ;
      std::vector<Candidate>().swap(cache[hidx].positive_candidates);
      std::vector<Candidate>().swap(cache[hidx].negative_candidates);
      cache[hidx].positive_candidates = pos_candidates;
      cache[hidx].negative_candidates = neg_candidates;

@@ -199,39 +173,34 @@ public:
      int size = cache[hidx].positive_candidates.size();
      int shift = (int)minimizers[0].second>>1;
      for (i = 0; i < size; ++i)
				cache[hidx].positive_candidates[i].refPos += shift ;
        cache[hidx].positive_candidates[i].position += shift;
      size = cache[hidx].negative_candidates.size();
      for (i = 0; i < size; ++i)
				cache[hidx].negative_candidates[i].refPos -= shift ;
			
        cache[hidx].negative_candidates[i].position -= shift;
    }
  }

	void DirectUpdateWeight(int idx, int weight)
	{
  void DirectUpdateWeight(int idx, int weight) {
    cache[idx].weight += weight;
  }

	uint64_t GetMemoryBytes()
	{
  uint64_t GetMemoryBytes() {
    int i;
    uint64_t ret = 0;
		for (i = 0 ; i < cache_size ; ++i)
		{
    for (i = 0; i < cache_size; ++i) {
      ret += sizeof(cache[i]) + cache[i].minimizers.capacity() * sizeof(uint64_t) 
        + cache[i].offsets.capacity() * sizeof(int)
				+ cache[i].positive_candidates.capacity() * sizeof(struct _candidate) 
				+ cache[i].negative_candidates.capacity() * sizeof(struct _candidate) ;
        + cache[i].positive_candidates.capacity() * sizeof(Candidate) 
        + cache[i].negative_candidates.capacity() * sizeof(Candidate);
    }
    return ret;
  }

	void PrintStats()
	{
  void PrintStats() {
    for (int i = 0 ; i < cache_size ; ++i)
      printf("%d %d\n", cache[i].weight, cache[i].finger_print_cnt_sum);
  }
};
} 
} // namespace chromap

#endif
Loading