Commit 31a02b34 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Merge branch 'master' into refactor

parents da883d90 67b62cd5
Loading
Loading
Loading
Loading
+138 −38
Original line number Diff line number Diff line
@@ -6,11 +6,108 @@
#define FINGER_PRINT_SIZE 103

namespace chromap {

class mm_cache_candidate_list {
private:
  uint32_t *positions;
  uint8_t *counts; // 0: the corresponding position is chr id. Otherwise, it is coordinate within the chr id
  uint32_t size;
  uint32_t actual_size;

  void Clear() {
    if (positions != NULL)
    {
      delete[] positions;
      positions = NULL;
    }
    if (counts != NULL)
    {
      delete[] counts;
      counts = NULL;
    }
    size = actual_size = 0;
  }
public:
  mm_cache_candidate_list(){
    positions = NULL;
    counts = NULL;
    size = actual_size = 0;
  }

  ~mm_cache_candidate_list() {
    Clear();
  }
  
  void Input(const std::vector<Candidate> &candidates) {
    Clear();
    int i, k;
    actual_size = candidates.size();
    if (actual_size == 0)
      return;
    size = actual_size + 1;
    
    // Collect the extra size introduced by the chr.
    for (i = 1; i < (int)actual_size; ++i) {
      if ((candidates[i].position >> 32) != (candidates[i - 1].position >> 32))
        ++size;
    }
    positions = new uint32_t[size];
    counts = new uint8_t[size];

    k = 0;
    for (i = 0; i < (int)actual_size; ++i) {
      if (i == 0 || (candidates[i].position >> 32) != (candidates[i - 1].position >> 32)) {
        counts[k] = 0;
	positions[k] = (uint32_t)(candidates[i].position >> 32);
	++k;
      }
      counts[k] = candidates[i].count;
      if (counts[k] == 0) // may happen on overflow for the tandem repeats read
        counts[k] = 1;
      positions[k] = (uint32_t)candidates[i].position;
      ++k;
    }
  }

  void Output(std::vector<Candidate> &candidates, int shift) {
    candidates.resize(actual_size);
    int i, k;
    k = 0;
    uint64_t rid = 0;
    for (i = 0; i < (int)size; ++i) {
     if (counts[i] == 0) {
       rid = (uint64_t)positions[i] << 32ull ;
     } else {
       candidates[k].position = rid | (uint32_t)(positions[i] + shift);
       candidates[k].count = counts[i];
       ++k;
     }
    }
  }

  void Shift(int offset) {
    int i;
    for (i = 0; i < (int)size; ++i) {
      if (counts[i] != 0)
        positions[i] += offset;
    }
  }

  uint32_t GetActualSize() {
    return actual_size;
  }
} ;

struct _mm_cache_entry {
  std::vector<uint64_t> minimizers;
  std::vector<int> offsets; // the distance to the next minimizer
  std::vector<Candidate> positive_candidates;
  std::vector<Candidate> negative_candidates;
  std::vector<uint8_t> strands;
  //std::vector<Candidate> positive_candidates;
  //std::vector<Candidate> negative_candidates;
 
  mm_cache_candidate_list positive_candidate_list ;
  mm_cache_candidate_list negative_candidate_list ;

  int weight;
  unsigned short finger_print_cnt[FINGER_PRINT_SIZE];
  int finger_print_cnt_sum;
@@ -32,7 +129,7 @@ class mm_cache {
    int i, j;
    int direction = 0;
    for (i = 0; i < size; ++i) {
      if (cache.minimizers[i] != minimizers[i].first)
      if (cache.minimizers[i] != minimizers[i].first || (minimizers[i].second & 1) != cache.strands[i])
        break;
    }
    if (i >= size) {
@@ -48,7 +145,7 @@ class mm_cache {
      return 1;

    for (i = 0, j = size - 1; i < size; ++i, --j) {
      if (cache.minimizers[i] != minimizers[j].first)
      if (cache.minimizers[i] != minimizers[j].first || (minimizers[j].second & 1) == cache.strands[i])
        break;
    }
    if (i >= size) {
@@ -88,35 +185,25 @@ class mm_cache {
  int Query(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<Candidate> &pos_candidates, std::vector<Candidate> &neg_candidates, uint32_t &repetitive_seed_length, uint32_t read_len) {
    int i;
    int msize = minimizers.size();
    if (msize == 0)
      return -1;
    uint64_t h = 0;
    for (i = 0 ; i < msize; ++i)
      h += (minimizers[i].first);
    int hidx = h % cache_size;
    int direction = IsMinimizersMatchCache(minimizers, cache[hidx]);
    if (direction == 1) {
      pos_candidates = cache[hidx].positive_candidates;
      neg_candidates = cache[hidx].negative_candidates;
      repetitive_seed_length = cache[hidx].repetitive_seed_length;
      int size = pos_candidates.size();
      int shift = (int)minimizers[0].second >> 1;
      for (i = 0; i < size; ++i)
        pos_candidates[i].position -= shift;
      size = neg_candidates.size();
      for (i = 0; i < size; ++i)
        neg_candidates[i].position += shift;
      cache[hidx].positive_candidate_list.Output(pos_candidates, -shift);
      cache[hidx].negative_candidate_list.Output(neg_candidates, shift);
      repetitive_seed_length = cache[hidx].repetitive_seed_length;
      return hidx;
    } else if (direction == -1) {// The "read" is on the other direction of the cached "read"
      int size = cache[hidx].negative_candidates.size();
      // Start position of the last minimizer shoud equal the first minimizer's end position in rc "read".
      int shift = read_len - ((int)minimizers[msize - 1].second>>1) - 1 + kmer_length - 1; 
      
      pos_candidates = cache[hidx].negative_candidates;
      for (i = 0; i < size; ++i)
        pos_candidates[i].position = cache[hidx].negative_candidates[i].position + shift - read_len + 1;
      size = cache[hidx].positive_candidates.size();
      neg_candidates = cache[hidx].positive_candidates;
      for (i = 0; i < size; ++i)
        neg_candidates[i].position = cache[hidx].positive_candidates[i].position - shift + read_len - 1;
      cache[hidx].negative_candidate_list.Output(pos_candidates, shift - read_len + 1);
      cache[hidx].positive_candidate_list.Output(neg_candidates, -shift + read_len - 1);
      repetitive_seed_length = cache[hidx].repetitive_seed_length;
      return hidx;
    } else {
@@ -127,7 +214,8 @@ class mm_cache {
  void Update(const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, std::vector<Candidate> &pos_candidates, std::vector<Candidate> &neg_candidates, uint32_t repetitive_seed_length) {
    int i;
    int msize = minimizers.size();

    if (msize == 0)
      return;
    uint64_t h = 0; // for hash
    uint64_t f = 0; // for finger printing
    for (i = 0; i < msize; ++i) {
@@ -156,29 +244,32 @@ class mm_cache {
      cache[hidx].minimizers.resize(msize);
      if (msize == 0) {
        cache[hidx].offsets.resize(0);
	cache[hidx].strands.resize(0);
        return;
      }

      cache[hidx].offsets.resize(msize - 1);
      cache[hidx].strands.resize(msize);
      for (i = 0; i < msize; ++i)
      {
        cache[hidx].minimizers[i] = minimizers[i].first;
        cache[hidx].strands[i] = (minimizers[i].second & 1);
      }
      for (i = 0; i < msize - 1; ++i) {
        cache[hidx].offsets[i] = ((int)minimizers[i + 1].second>>1) - ((int)minimizers[i].second>>1);
      }
      std::vector<Candidate>().swap(cache[hidx].positive_candidates);
      /*std::vector<Candidate>().swap(cache[hidx].positive_candidates);
      std::vector<Candidate>().swap(cache[hidx].negative_candidates);
      cache[hidx].positive_candidates = pos_candidates;
      cache[hidx].negative_candidates = neg_candidates;
      cache[hidx].negative_candidates = neg_candidates;*/
      cache[hidx].positive_candidate_list.Input(pos_candidates);
      cache[hidx].negative_candidate_list.Input(neg_candidates);
      cache[hidx].repetitive_seed_length = repetitive_seed_length;
      
      // adjust the candidate position.
      int size = cache[hidx].positive_candidates.size();
      int shift = (int)minimizers[0].second>>1;
      for (i = 0; i < size; ++i)
        cache[hidx].positive_candidates[i].position += shift;
      size = cache[hidx].negative_candidates.size();
      for (i = 0; i < size; ++i)
        cache[hidx].negative_candidates[i].position -= shift;
      cache[hidx].positive_candidate_list.Shift(shift);
      cache[hidx].negative_candidate_list.Shift(-shift);
    }
  }

@@ -187,20 +278,29 @@ class mm_cache {
  }

  uint64_t GetMemoryBytes() {
    return 0;
    int i;
    uint64_t ret = 0;
    for (i = 0; i < cache_size; ++i) {
      ret += sizeof(cache[i]) + cache[i].minimizers.capacity() * sizeof(uint64_t) 
        + cache[i].offsets.capacity() * sizeof(int)
        + cache[i].positive_candidates.capacity() * sizeof(Candidate) 
        + cache[i].negative_candidates.capacity() * sizeof(Candidate);
      ret += sizeof(cache[i]) + cache[i].minimizers.capacity() * sizeof(uint64_t) ;
        //+ cache[i].offsets.capacity() * sizeof(int)
        //+ cache[i].positive_candidates.capacity() * sizeof(Candidate) 
        //+ cache[i].negative_candidates.capacity() * sizeof(Candidate);
    }
    return ret;
  }

  void PrintStats() {
    for (int i = 0 ; i < cache_size ; ++i)
      printf("%d %d\n", cache[i].weight, cache[i].finger_print_cnt_sum);
    {
      printf("%d %d %u ", cache[i].weight, cache[i].finger_print_cnt_sum, cache[i].positive_candidate_list.GetActualSize() + 
      			cache[i].negative_candidate_list.GetActualSize()) ;
      int tmp = 0;
      for (int j = 0 ; j < FINGER_PRINT_SIZE ; ++j)
        if (cache[i].finger_print_cnt[j] > tmp)
		tmp = cache[i].finger_print_cnt[j] ;
      printf("%d\n", tmp);
    }
  }
};
} // namespace chromap