Commit 70059f8f authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Format code for index.

parent 3812b8e6
Loading
Loading
Loading
Loading
+228 −212
Original line number Diff line number Diff line
@@ -9,182 +9,33 @@

namespace chromap {

void Index::Statistics(uint32_t num_sequences,
                       const SequenceBatch &reference) const {
  double real_start_time = GetRealTime();
  int n = 0, n1 = 0;
  uint32_t i;
  uint64_t sum = 0, len = 0;
  fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; #seq: %d\n", __func__,
          kmer_size_, window_size_, num_sequences);
  for (i = 0; i < num_sequences; ++i) {
    len += reference.GetSequenceLengthAt(i);
  }
  assert(len == reference.GetNumBases());
  if (lookup_table_) {
    n += kh_size(lookup_table_);
  }
  for (khint_t k = 0; k < kh_end(lookup_table_); ++k) {
    if (kh_exist(lookup_table_, k)) {
      sum +=
          kh_key(lookup_table_, k) & 1 ? 1 : (uint32_t)kh_val(lookup_table_, k);
      if (kh_key(lookup_table_, k) & 1) ++n1;
    }
  }
  fprintf(stderr,
          "[M::%s::%.3f] distinct minimizers: %d (%.2f%% are singletons); "
          "average occurrences: %.3lf; average spacing: %.3lf\n",
          __func__, GetRealTime() - real_start_time, n, 100.0 * n1 / n,
          (double)sum / n, (double)len / sum);
}

// always reserve space for minimizers in other functions
void Index::GenerateMinimizerSketch(
    const SequenceBatch &sequence_batch, uint32_t sequence_index,
    std::vector<std::pair<uint64_t, uint64_t> > &minimizers) const {
  uint64_t num_shifted_bits = 2 * (kmer_size_ - 1);
  uint64_t mask = (((uint64_t)1) << (2 * kmer_size_)) - 1;
  uint64_t seeds_in_two_strands[2] = {0, 0};
  std::pair<uint64_t, uint64_t> buffer[256];
  std::pair<uint64_t, uint64_t> min_seed = {UINT64_MAX, UINT64_MAX};
  uint32_t sequence_length = sequence_batch.GetSequenceLengthAt(sequence_index);
  const char *sequence = sequence_batch.GetSequenceAt(sequence_index);
  assert(sequence_length > 0 && (window_size_ > 0 && window_size_ < 256) &&
         (kmer_size_ > 0 &&
          kmer_size_ <= 28));  // 56 bits for k-mer; could use long k-mers, but
                               // 28 enough in practice
  memset(buffer, 0xff, window_size_ * 16);  // 2 uint64_t cost 16 bytes
  int unambiguous_length = 0;
  int position_in_buffer = 0;
  int min_position = 0;
  // std::pair<uint64_t, uint64_t> pre_seed = {UINT64_MAX, UINT64_MAX};
  for (uint32_t position = 0; position < sequence_length; ++position) {
    uint8_t current_base = SequenceBatch::CharToUint8(sequence[position]);
    std::pair<uint64_t, uint64_t> current_seed = {UINT64_MAX, UINT64_MAX};
    if (current_base < 4) {  // not an ambiguous base
      seeds_in_two_strands[0] =
          ((seeds_in_two_strands[0] << 2) | current_base) &
          mask;  // forward k-mer
      seeds_in_two_strands[1] = (seeds_in_two_strands[1] >> 2) |
                                (((uint64_t)(3 ^ current_base))
                                 << num_shifted_bits);  // reverse k-mer
      if (seeds_in_two_strands[0] == seeds_in_two_strands[1]) {
        continue;  // skip "symmetric k-mers" as we don't know it strand
      }
      uint64_t hash_keys_for_two_seeds[2] = {
          Hash64(seeds_in_two_strands[0], mask),
          Hash64(seeds_in_two_strands[1], mask)};
      uint64_t strand = hash_keys_for_two_seeds[0] < hash_keys_for_two_seeds[1]
                            ? 0
                            : 1;  // strand
      // uint64_t strand = seeds_in_two_strands[0] < seeds_in_two_strands[1] ? 0
      // : 1; // strand
      ++unambiguous_length;
      if (unambiguous_length >= kmer_size_) {
        // current_seed.first = Hash64(seeds_in_two_strands[strand], mask);
        current_seed.first = Hash64(hash_keys_for_two_seeds[strand], mask);
        current_seed.second =
            ((((uint64_t)sequence_index) << 32 | (uint32_t)position) << 1) |
            strand;
      }
    } else {
      unambiguous_length = 0;
    }
    buffer[position_in_buffer] =
        current_seed;  // need to do this here as appropriate position_in_buffer
                       // and buf[position_in_buffer] are needed below
    if (unambiguous_length == window_size_ + kmer_size_ - 1 &&
        min_seed.first != UINT64_MAX &&
        min_seed.first <
            current_seed
                .first) {  // special case for the first window - because
                           // identical k-mers are not stored yet
      for (int j = position_in_buffer + 1; j < window_size_; ++j)
        if (min_seed.first == buffer[j].first &&
            buffer[j].second != min_seed.second)
          minimizers.push_back(buffer[j]);
      for (int j = 0; j < position_in_buffer; ++j)
        if (min_seed.first == buffer[j].first &&
            buffer[j].second != min_seed.second)
          minimizers.push_back(buffer[j]);
    }

    if (current_seed.first <=
        min_seed.first) {  // a new minimum; then write the old min
      if (unambiguous_length >= window_size_ + kmer_size_ &&
          min_seed.first != UINT64_MAX) {
        minimizers.push_back(min_seed);
      }
      min_seed = current_seed;
      min_position = position_in_buffer;
    } else if (position_in_buffer ==
               min_position) {  // old min has moved outside the window
      if (unambiguous_length >= window_size_ + kmer_size_ - 1 &&
          min_seed.first != UINT64_MAX) {
        minimizers.push_back(min_seed);
      }
      min_seed.first = UINT64_MAX;
      for (int j = position_in_buffer + 1; j < window_size_;
           ++j) {  // the two loops are necessary when there are identical
                   // k-mers
        if (min_seed.first >= buffer[j].first) {  // >= is important s.t. min is
                                                  // always the closest k-mer
          min_seed = buffer[j];
          min_position = j;
        }
      }
      for (int j = 0; j <= position_in_buffer; ++j) {
        if (min_seed.first >= buffer[j].first) {
          min_seed = buffer[j];
          min_position = j;
        }
      }
      if (unambiguous_length >= window_size_ + kmer_size_ - 1 &&
          min_seed.first != UINT64_MAX) {  // write identical k-mers
        for (int j = position_in_buffer + 1; j < window_size_;
             ++j)  // these two loops make sure the output is sorted
          if (min_seed.first == buffer[j].first &&
              min_seed.second != buffer[j].second)
            minimizers.push_back(buffer[j]);
        for (int j = 0; j <= position_in_buffer; ++j)
          if (min_seed.first == buffer[j].first &&
              min_seed.second != buffer[j].second)
            minimizers.push_back(buffer[j]);
      }
    }
    ++position_in_buffer;
    if (position_in_buffer == window_size_) {
      position_in_buffer = 0;
    }
  }
  if (min_seed.first != UINT64_MAX) {
    minimizers.push_back(min_seed);
  }
}

void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
  double real_start_time = GetRealTime();
  // tmp_table stores (minimizer, position)
  // tmp_table stores (minimizer, position).
  std::vector<std::pair<uint64_t, uint64_t> > tmp_table;
  tmp_table.reserve(reference.GetNumBases() / window_size_ * 2);

  for (uint32_t sequence_index = 0; sequence_index < num_sequences;
       ++sequence_index) {
    GenerateMinimizerSketch(reference, sequence_index, tmp_table);
  }
  std::cerr << "Collected " << tmp_table.size() << " minimizers.\n";

  std::stable_sort(tmp_table.begin(), tmp_table.end());
  std::cerr << "Sorted minimizers.\n";

  uint32_t num_minimizers = tmp_table.size();
  assert(num_minimizers != 0 &&
         num_minimizers <=
             INT_MAX);  // Here I make sure the # minimizers is less than the
                        // limit of signed int32, so that I can use int to store
                        // position later.

  // Here I make sure the # minimizers is less than the limit of signed int32,
  // so that I can use int to store position later.
  assert(num_minimizers != 0 && num_minimizers <= INT_MAX);
  occurrence_table_.reserve(num_minimizers);

  uint64_t previous_key = tmp_table[0].first;
  uint32_t num_previous_minimizer_occurrences = 0;
  uint64_t num_nonsingletons = 0;
  uint32_t num_singletons = 0;

  for (uint32_t ti = 0; ti < num_minimizers; ++ti) {
    uint64_t current_key = tmp_table[ti].first;
    if (current_key != previous_key) {
@@ -209,10 +60,12 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
    occurrence_table_.push_back(tmp_table[ti].second);
    previous_key = current_key;
  }

  int khash_return_code;
  khiter_t khash_iterator =
      kh_put(k64, lookup_table_, previous_key << 1, &khash_return_code);
  assert(khash_return_code != -1 && khash_return_code != 0);

  if (num_previous_minimizer_occurrences == 1) {  // singleton
    kh_key(lookup_table_, khash_iterator) |= 1;
    kh_value(lookup_table_, khash_iterator) = occurrence_table_.back();
@@ -224,6 +77,7 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
    num_nonsingletons += num_previous_minimizer_occurrences;
  }
  assert(num_nonsingletons + num_singletons == num_minimizers);

  std::cerr << "Kmer size: " << kmer_size_ << ", window size: " << window_size_
            << ".\n";
  std::cerr << "Lookup table size: " << kh_size(lookup_table_)
@@ -234,44 +88,10 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
            << "s.\n";
}

void Index::CheckIndex(uint32_t num_sequences,
                       const SequenceBatch &reference) const {
  std::vector<std::pair<uint64_t, uint64_t> > tmp_table;
  tmp_table.reserve(reference.GetNumBases() / window_size_ * 2);
  for (uint32_t sequence_index = 0; sequence_index < num_sequences;
       ++sequence_index) {
    GenerateMinimizerSketch(reference, sequence_index, tmp_table);
  }
  std::cerr << "Collected " << tmp_table.size() << " minimizers.\n";
  std::stable_sort(tmp_table.begin(), tmp_table.end());
  std::cerr << "Sorted minimizers.\n";
  uint32_t count = 0;
  for (uint32_t i = 0; i < tmp_table.size(); ++i) {
    khiter_t khash_iterator =
        kh_get(k64, lookup_table_, tmp_table[i].first << 1);
    assert(khash_iterator != kh_end(lookup_table_));
    uint64_t key = kh_key(lookup_table_, khash_iterator);
    uint64_t value = kh_value(lookup_table_, khash_iterator);
    if (key & 1) {  // singleton
      assert(tmp_table[i].second == value);
      count = 0;
    } else {
      uint32_t offset = value >> 32;
      uint32_t num_occ = value;
      uint64_t value_in_index = occurrence_table_[offset + count];
      assert(value_in_index == tmp_table[i].second);
      ++count;
      if (count == num_occ) {
        count = 0;
      }
    }
  }
}

void Index::Save() const {
  double real_start_time = GetRealTime();
  FILE *index_file = fopen(index_file_path_.c_str(), "wb");
  assert(index_file != NULL);
  assert(index_file != nullptr);
  uint64_t num_bytes = 0;
  int err = 0;
  err = fwrite(&kmer_size_, sizeof(int), 1, index_file);
@@ -305,7 +125,7 @@ void Index::Save() const {
void Index::Load() {
  double real_start_time = GetRealTime();
  FILE *index_file = fopen(index_file_path_.c_str(), "rb");
  assert(index_file != NULL);
  assert(index_file != nullptr);
  int err = 0;
  err = fread(&kmer_size_, sizeof(int), 1, index_file);
  assert(err != 0);
@@ -333,7 +153,69 @@ void Index::Load() {
            << GetRealTime() - real_start_time << "s.\n";
}

// Return the number of repetitive seeds.
void Index::Statistics(uint32_t num_sequences,
                       const SequenceBatch &reference) const {
  double real_start_time = GetRealTime();
  int n = 0, n1 = 0;
  uint32_t i;
  uint64_t sum = 0, len = 0;
  fprintf(stderr, "[M::%s] kmer size: %d; skip: %d; #seq: %d\n", __func__,
          kmer_size_, window_size_, num_sequences);
  for (i = 0; i < num_sequences; ++i) {
    len += reference.GetSequenceLengthAt(i);
  }
  assert(len == reference.GetNumBases());
  if (lookup_table_) {
    n += kh_size(lookup_table_);
  }
  for (khint_t k = 0; k < kh_end(lookup_table_); ++k) {
    if (kh_exist(lookup_table_, k)) {
      sum +=
          kh_key(lookup_table_, k) & 1 ? 1 : (uint32_t)kh_val(lookup_table_, k);
      if (kh_key(lookup_table_, k) & 1) ++n1;
    }
  }
  fprintf(stderr,
          "[M::%s::%.3f] distinct minimizers: %d (%.2f%% are singletons); "
          "average occurrences: %.3lf; average spacing: %.3lf\n",
          __func__, GetRealTime() - real_start_time, n, 100.0 * n1 / n,
          (double)sum / n, (double)len / sum);
}

void Index::CheckIndex(uint32_t num_sequences,
                       const SequenceBatch &reference) const {
  std::vector<std::pair<uint64_t, uint64_t> > tmp_table;
  tmp_table.reserve(reference.GetNumBases() / window_size_ * 2);
  for (uint32_t sequence_index = 0; sequence_index < num_sequences;
       ++sequence_index) {
    GenerateMinimizerSketch(reference, sequence_index, tmp_table);
  }
  std::cerr << "Collected " << tmp_table.size() << " minimizers.\n";
  std::stable_sort(tmp_table.begin(), tmp_table.end());
  std::cerr << "Sorted minimizers.\n";
  uint32_t count = 0;
  for (uint32_t i = 0; i < tmp_table.size(); ++i) {
    khiter_t khash_iterator =
        kh_get(k64, lookup_table_, tmp_table[i].first << 1);
    assert(khash_iterator != kh_end(lookup_table_));
    uint64_t key = kh_key(lookup_table_, khash_iterator);
    uint64_t value = kh_value(lookup_table_, khash_iterator);
    if (key & 1) {  // singleton
      assert(tmp_table[i].second == value);
      count = 0;
    } else {
      uint32_t offset = value >> 32;
      uint32_t num_occ = value;
      uint64_t value_in_index = occurrence_table_[offset + count];
      assert(value_in_index == tmp_table[i].second);
      ++count;
      if (count == num_occ) {
        count = 0;
      }
    }
  }
}

int Index::CollectSeedHits(
    int max_seed_frequency, int repetitive_seed_frequency,
    const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,
@@ -349,6 +231,7 @@ int Index::CollectSeedHits(
  }
  positive_hits.reserve(max_seed_frequency * 2);
  negative_hits.reserve(max_seed_frequency * 2);

  uint32_t previous_repetitive_seed_position =
      std::numeric_limits<uint32_t>::max();
  for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
@@ -358,6 +241,7 @@ int Index::CollectSeedHits(
      // std::cerr << "The minimizer is not in reference!\n";
      continue;
    }

    uint64_t value = kh_value(lookup_table_, khash_iterator);
    uint32_t read_position = minimizers[mi].second >> 1;
    if (kh_key(lookup_table_, khash_iterator) & 1) {  // singleton
@@ -422,6 +306,7 @@ int Index::CollectSeedHits(
          }
        }
      }

      if (num_occurrences >= (uint32_t)repetitive_seed_frequency) {
        if (previous_repetitive_seed_position >
            read_position) {  // first minimizer
@@ -447,7 +332,7 @@ int Index::CollectSeedHits(
    positive_hits.clear();
    for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
      if (mm_positive_hits[mi].size() == 0) continue;
      // only the positive part may have the underflow issue
      // Only the positive part may have the underflow issue.
      if (heap_resort)
        std::sort(mm_positive_hits[mi].begin(), mm_positive_hits[mi].end());
      struct mmHit nh;
@@ -479,6 +364,7 @@ int Index::CollectSeedHits(
      heap.push(nh);
      mm_pos[mi] = 0;
    }

    while (!heap.empty()) {
      struct mmHit top = heap.top();
      heap.pop();
@@ -491,6 +377,7 @@ int Index::CollectSeedHits(
        heap.push(nh);
      }
    }

    delete[] mm_positive_hits;
    delete[] mm_negative_hits;
    delete[] mm_pos;
@@ -498,20 +385,20 @@ int Index::CollectSeedHits(
    std::sort(positive_hits.begin(), positive_hits.end());
    std::sort(negative_hits.begin(), negative_hits.end());
  }
  /*for (uint32_t mi = 0 ; mi < positive_hits->size() ; ++mi)

#ifdef LI_DEBUG
  for (uint32_t mi = 0; mi < positive_hits->size(); ++mi)
    printf("+ %llu %d %d\n", positive_hits->at(mi),
           (int)(positive_hits->at(mi) >> 32), (int)(positive_hits->at(mi)));

  for (uint32_t mi = 0; mi < negative_hits->size(); ++mi)
    printf("- %llu %d %d\n", negative_hits->at(mi),
    (int)(negative_hits->at(mi)>>32), (int)(negative_hits->at(mi))) ;*/
           (int)(negative_hits->at(mi) >> 32), (int)(negative_hits->at(mi)));
#endif

  return repetitive_seed_count;
}

// Input a search range, for each best mate candidate, serach for minimizer
// hits. Return the minimizer count of the best candidate, if it finishes
// normally or return a negative value if it stops early due to too many
// candidates with low minimizer count.
int Index::CollectSeedHitsFromRepetitiveReadWithMateInfo(
    int error_threshold,
    const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,
@@ -535,9 +422,11 @@ int Index::CollectSeedHitsFromRepetitiveReadWithMateInfo(
  const bool mate_has_too_many_candidates =
      best_candidate_num >= 300 ||
      mate_candidates_size > (uint32_t)max_seed_frequency0;

  const bool mate_has_too_many_low_support_candidates =
      max_minimizer_count <= min_num_seeds_required_for_mapping &&
      best_candidate_num >= 200;

  if (mate_has_too_many_candidates ||
      mate_has_too_many_low_support_candidates) {
    return -max_minimizer_count;
@@ -545,6 +434,7 @@ int Index::CollectSeedHitsFromRepetitiveReadWithMateInfo(

  std::vector<std::pair<uint64_t, uint64_t> > boundaries;
  boundaries.reserve(300);

  for (uint32_t ci = 0; ci < mate_candidates_size; ++ci) {
    if (mate_candidates[ci].count == max_minimizer_count) {
      const uint64_t boundary_start =
@@ -677,16 +567,142 @@ int Index::CollectSeedHitsFromRepetitiveReadWithMateInfo(
  }    // for mi

  std::sort(hits.begin(), hits.end());
  // for (uint32_t i = 0 ; i < hits->size(); ++i)
  //	  printf("%s: %d %d\n", __func__,
  //(int)(hits->at(i)>>32),(int)hits->at(i)); std::cerr << "Rescue gen on one
  // dir\n";

  // GenerateCandidatesOnOneDirection(error_threshold, /*num_seeds_required=*/1,
  //                                 minimizers.size(), hits, candidates);
#ifdef LI_DEBUG
  for (uint32_t i = 0; i < hits->size(); ++i)
    printf("%s: %d %d\n", __func__, (int)(hits->at(i) >> 32), (int)hits->at(i));
  std::cerr << "Rescue gen on one dir\n ";
  printf("%s: %d\n", __func__, hits->size());
#endif

  // printf("%s: %d %d\n", __func__, hits->size(), candidates->size()) ;
  return max_minimizer_count;
}

void Index::GenerateMinimizerSketch(
    const SequenceBatch &sequence_batch, uint32_t sequence_index,
    std::vector<std::pair<uint64_t, uint64_t> > &minimizers) const {
  uint64_t num_shifted_bits = 2 * (kmer_size_ - 1);
  uint64_t mask = (((uint64_t)1) << (2 * kmer_size_)) - 1;
  uint64_t seeds_in_two_strands[2] = {0, 0};
  std::pair<uint64_t, uint64_t> buffer[256];
  std::pair<uint64_t, uint64_t> min_seed = {UINT64_MAX, UINT64_MAX};
  uint32_t sequence_length = sequence_batch.GetSequenceLengthAt(sequence_index);
  const char *sequence = sequence_batch.GetSequenceAt(sequence_index);

  // 56 bits for k-mer; could use long k-mers, but 28 enough in practice.
  assert(sequence_length > 0 && (window_size_ > 0 && window_size_ < 256) &&
         (kmer_size_ > 0 && kmer_size_ <= 28));
  memset(buffer, 0xff, window_size_ * 16);  // 2 uint64_t cost 16 bytes
  int unambiguous_length = 0;
  int position_in_buffer = 0;
  int min_position = 0;

  for (uint32_t position = 0; position < sequence_length; ++position) {
    uint8_t current_base = SequenceBatch::CharToUint8(sequence[position]);
    std::pair<uint64_t, uint64_t> current_seed = {UINT64_MAX, UINT64_MAX};
    if (current_base < 4) {
      // Not an ambiguous base.
      // Forward k-mer.
      seeds_in_two_strands[0] =
          ((seeds_in_two_strands[0] << 2) | current_base) & mask;
      // Reverse k-mer.
      seeds_in_two_strands[1] =
          (seeds_in_two_strands[1] >> 2) |
          (((uint64_t)(3 ^ current_base)) << num_shifted_bits);

      if (seeds_in_two_strands[0] == seeds_in_two_strands[1]) {
        // Skip "symmetric k-mers" as we don't know it strand.
        continue;
      }

      uint64_t hash_keys_for_two_seeds[2] = {
          Hash64(seeds_in_two_strands[0], mask),
          Hash64(seeds_in_two_strands[1], mask)};

      uint64_t strand =
          hash_keys_for_two_seeds[0] < hash_keys_for_two_seeds[1] ? 0 : 1;

      ++unambiguous_length;

      if (unambiguous_length >= kmer_size_) {
        current_seed.first = Hash64(hash_keys_for_two_seeds[strand], mask);
        current_seed.second =
            ((((uint64_t)sequence_index) << 32 | (uint32_t)position) << 1) |
            strand;
      }
    } else {
      unambiguous_length = 0;
    }

    // Need to do this here as appropriate position_in_buffer and
    // buf[position_in_buffer] are needed below.
    buffer[position_in_buffer] = current_seed;
    if (unambiguous_length == window_size_ + kmer_size_ - 1 &&
        min_seed.first != UINT64_MAX && min_seed.first < current_seed.first) {
      // Special case for the first window - because identical k-mers are not
      // stored yet.
      for (int j = position_in_buffer + 1; j < window_size_; ++j)
        if (min_seed.first == buffer[j].first &&
            buffer[j].second != min_seed.second)
          minimizers.push_back(buffer[j]);
      for (int j = 0; j < position_in_buffer; ++j)
        if (min_seed.first == buffer[j].first &&
            buffer[j].second != min_seed.second)
          minimizers.push_back(buffer[j]);
    }

    if (current_seed.first <= min_seed.first) {
      // A new minimum; then write the old min.
      if (unambiguous_length >= window_size_ + kmer_size_ &&
          min_seed.first != UINT64_MAX) {
        minimizers.push_back(min_seed);
      }
      min_seed = current_seed;
      min_position = position_in_buffer;
    } else if (position_in_buffer == min_position) {
      // Old min has moved outside the window.
      if (unambiguous_length >= window_size_ + kmer_size_ - 1 &&
          min_seed.first != UINT64_MAX) {
        minimizers.push_back(min_seed);
      }
      min_seed.first = UINT64_MAX;
      for (int j = position_in_buffer + 1; j < window_size_; ++j) {
        // The two loops are necessary when there are identical k-mers.
        if (min_seed.first >= buffer[j].first) {
          // >= is important s.t. min is always the closest k-mer.
          min_seed = buffer[j];
          min_position = j;
        }
      }
      for (int j = 0; j <= position_in_buffer; ++j) {
        if (min_seed.first >= buffer[j].first) {
          min_seed = buffer[j];
          min_position = j;
        }
      }
      if (unambiguous_length >= window_size_ + kmer_size_ - 1 &&
          min_seed.first != UINT64_MAX) {
        // Write identical k-mers.
        // These two loops make sure the output is sorted.
        for (int j = position_in_buffer + 1; j < window_size_; ++j)
          if (min_seed.first == buffer[j].first &&
              min_seed.second != buffer[j].second)
            minimizers.push_back(buffer[j]);
        for (int j = 0; j <= position_in_buffer; ++j)
          if (min_seed.first == buffer[j].first &&
              min_seed.second != buffer[j].second)
            minimizers.push_back(buffer[j]);
      }
    }
    ++position_in_buffer;
    if (position_in_buffer == window_size_) {
      position_in_buffer = 0;
    }
  }

  if (min_seed.first != UINT64_MAX) {
    minimizers.push_back(min_seed);
  }
}

}  // namespace chromap
+11 −3
Original line number Diff line number Diff line
@@ -6,9 +6,9 @@
#include <string>
#include <vector>

#include "index_parameters.h"
#include "khash.h"
#include "mapping_metadata.h"
#include "index_parameters.h"
#include "sequence_batch.h"
#include "utils.h"

@@ -40,9 +40,9 @@ class Index {
  ~Index() { Destroy(); }

  void Destroy() {
    if (lookup_table_ != NULL) {
    if (lookup_table_ != nullptr) {
      kh_destroy(k64, lookup_table_);
      lookup_table_ = NULL;
      lookup_table_ = nullptr;
    }

    std::vector<uint64_t>().swap(occurrence_table_);
@@ -54,16 +54,23 @@ class Index {

  void Load();

  // Output index stats.
  void Statistics(uint32_t num_sequences, const SequenceBatch &reference) const;

  // Check the index for some reference genome. Only for debug.
  void CheckIndex(uint32_t num_sequences, const SequenceBatch &reference) const;

  // Return the number of repetitive seeds.
  int CollectSeedHits(
      int max_seed_frequency, int repetitive_seed_frequency,
      const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,
      uint32_t &repetitive_seed_length, std::vector<uint64_t> &positive_hits,
      std::vector<uint64_t> &negative_hits, bool use_heap) const;

  // Input a search range, for each best mate candidate, serach for minimizer
  // hits. Return the minimizer count of the best candidate, if it finishes
  // normally or return a negative value if it stops early due to too many
  // candidates with low minimizer count.
  int CollectSeedHitsFromRepetitiveReadWithMateInfo(
      int error_threshold,
      const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,
@@ -79,6 +86,7 @@ class Index {
  uint32_t GetLookupTableSize() const { return kh_size(lookup_table_); }

  // TODO(Haowen): move this out to form a minimizer class or struct.
  // One should always reserve space for minimizers in other functions.
  void GenerateMinimizerSketch(
      const SequenceBatch &sequence_batch, uint32_t sequence_index,
      std::vector<std::pair<uint64_t, uint64_t> > &minimizers) const;