Commit 212145ea authored by Swift Genomics's avatar Swift Genomics Committed by swiftgenomics
Browse files

Use funcs instead of hardcoded bit ops.

parent 5f3094ca
Loading
Loading
Loading
Loading
+1 −1
Original line number Original line Diff line number Diff line
@@ -29,7 +29,7 @@
#include "temp_mapping.h"
#include "temp_mapping.h"
#include "utils.h"
#include "utils.h"


#define CHROMAP_VERSION "0.2.3-r437"
#define CHROMAP_VERSION "0.2.3-r446"


namespace chromap {
namespace chromap {


+38 −21
Original line number Original line Diff line number Diff line
@@ -5,8 +5,8 @@
#include <algorithm>
#include <algorithm>
#include <iostream>
#include <iostream>


#include "minimizer_utils.h"
#include "minimizer_generator.h"
#include "minimizer_generator.h"
#include "minimizer_utils.h"


namespace chromap {
namespace chromap {


@@ -39,7 +39,8 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {


  occurrence_table_.reserve(num_minimizers);
  occurrence_table_.reserve(num_minimizers);


  uint64_t previous_key = minimizers[0].GetHashKey();
  uint64_t previous_key =
      GenerateHashKeyInLookupTable(minimizers[0].GetHashKey());
  uint32_t num_previous_minimizer_occurrences = 0;
  uint32_t num_previous_minimizer_occurrences = 0;
  uint64_t num_nonsingletons = 0;
  uint64_t num_nonsingletons = 0;
  uint32_t num_singletons = 0;
  uint32_t num_singletons = 0;
@@ -47,12 +48,14 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
  for (uint32_t ti = 0; ti <= num_minimizers; ++ti) {
  for (uint32_t ti = 0; ti <= num_minimizers; ++ti) {
    const bool is_last_iteration = ti == num_minimizers;
    const bool is_last_iteration = ti == num_minimizers;
    const uint64_t current_key =
    const uint64_t current_key =
        is_last_iteration ? previous_key + 1 : minimizers[ti].GetHashKey();
        is_last_iteration
            ? previous_key + 1
            : GenerateHashKeyInLookupTable(minimizers[ti].GetHashKey());


    if (current_key != previous_key) {
    if (current_key != previous_key) {
      int khash_return_code = 0;
      int khash_return_code = 0;
      khiter_t khash_iterator =
      khiter_t khash_iterator =
          kh_put(k64, lookup_table_, previous_key << 1, &khash_return_code);
          kh_put(k64, lookup_table_, previous_key, &khash_return_code);
      assert(khash_return_code != -1 && khash_return_code != 0);
      assert(khash_return_code != -1 && khash_return_code != 0);


      if (num_previous_minimizer_occurrences == 1) {
      if (num_previous_minimizer_occurrences == 1) {
@@ -65,7 +68,8 @@ void Index::Construct(uint32_t num_sequences, const SequenceBatch &reference) {
        ++num_singletons;
        ++num_singletons;
      } else {
      } else {
        kh_value(lookup_table_, khash_iterator) =
        kh_value(lookup_table_, khash_iterator) =
            (num_nonsingletons << 32) | num_previous_minimizer_occurrences;
            GenerateEntryValueInLookupTable(num_nonsingletons,
                                            num_previous_minimizer_occurrences);
        num_nonsingletons += num_previous_minimizer_occurrences;
        num_nonsingletons += num_previous_minimizer_occurrences;
      }
      }
      num_previous_minimizer_occurrences = 1;
      num_previous_minimizer_occurrences = 1;
@@ -221,7 +225,8 @@ void Index::CheckIndex(uint32_t num_sequences,
  uint32_t count = 0;
  uint32_t count = 0;
  for (uint32_t i = 0; i < minimizers.size(); ++i) {
  for (uint32_t i = 0; i < minimizers.size(); ++i) {
    khiter_t khash_iterator =
    khiter_t khash_iterator =
        kh_get(k64, lookup_table_, minimizers[i].GetHashKey() << 1);
        kh_get(k64, lookup_table_,
               GenerateHashKeyInLookupTable(minimizers[i].GetHashKey()));
    assert(khash_iterator != kh_end(lookup_table_));
    assert(khash_iterator != kh_end(lookup_table_));
    uint64_t key = kh_key(lookup_table_, khash_iterator);
    uint64_t key = kh_key(lookup_table_, khash_iterator);
    uint64_t value = kh_value(lookup_table_, khash_iterator);
    uint64_t value = kh_value(lookup_table_, khash_iterator);
@@ -229,8 +234,8 @@ void Index::CheckIndex(uint32_t num_sequences,
      assert(minimizers[i].GetHit() == value);
      assert(minimizers[i].GetHit() == value);
      count = 0;
      count = 0;
    } else {
    } else {
      uint32_t offset = value >> 32;
      uint32_t offset = GenerateOffsetInOccurrenceTable(value);
      uint32_t num_occ = value;
      uint32_t num_occ = GenerateNumOccurrenceInOccurrenceTable(value);
      uint64_t value_in_index = occurrence_table_[offset + count];
      uint64_t value_in_index = occurrence_table_[offset + count];
      assert(value_in_index == minimizers[i].GetHit());
      assert(value_in_index == minimizers[i].GetHit());
      ++count;
      ++count;
@@ -292,7 +297,7 @@ uint64_t Index::GenerateCandidatePositionForSingleSeedHit(
  const uint64_t reference_id = GenerateSequenceIndex(reference_seed_hit);
  const uint64_t reference_id = GenerateSequenceIndex(reference_seed_hit);


  const uint64_t candidate_position =
  const uint64_t candidate_position =
      (reference_id << 32) | mapping_start_position;
      GenerateCandidatePosition(reference_id, mapping_start_position);


  return candidate_position;
  return candidate_position;
}
}
@@ -334,7 +339,8 @@ int Index::GenerateCandidatePositions(


  for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
  for (uint32_t mi = 0; mi < num_minimizers; ++mi) {
    khiter_t khash_iterator =
    khiter_t khash_iterator =
        kh_get(k64, lookup_table_, minimizers[mi].GetHashKey() << 1);
        kh_get(k64, lookup_table_,
               GenerateHashKeyInLookupTable(minimizers[mi].GetHashKey()));
    if (khash_iterator == kh_end(lookup_table_)) {
    if (khash_iterator == kh_end(lookup_table_)) {
      // std::cerr << "The minimizer is not in reference!\n";
      // std::cerr << "The minimizer is not in reference!\n";
      continue;
      continue;
@@ -373,8 +379,10 @@ int Index::GenerateCandidatePositions(


    const uint64_t occurrence_info = kh_value(lookup_table_, khash_iterator);
    const uint64_t occurrence_info = kh_value(lookup_table_, khash_iterator);


    const uint32_t occurrence_offset = occurrence_info >> 32;
    const uint32_t occurrence_offset =
    const uint32_t num_occurrences = occurrence_info;
        GenerateOffsetInOccurrenceTable(occurrence_info);
    const uint32_t num_occurrences =
        GenerateNumOccurrenceInOccurrenceTable(occurrence_info);


    if (!generating_config.IsFrequentSeed(num_occurrences)) {
    if (!generating_config.IsFrequentSeed(num_occurrences)) {
      for (uint32_t oi = 0; oi < num_occurrences; ++oi) {
      for (uint32_t oi = 0; oi < num_occurrences; ++oi) {
@@ -532,7 +540,8 @@ int Index::GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand(


  for (uint32_t mi = 0; mi < minimizers.size(); ++mi) {
  for (uint32_t mi = 0; mi < minimizers.size(); ++mi) {
    khiter_t khash_iterator =
    khiter_t khash_iterator =
        kh_get(k64, lookup_table_, minimizers[mi].GetHashKey() << 1);
        kh_get(k64, lookup_table_,
               GenerateHashKeyInLookupTable(minimizers[mi].GetHashKey()));
    if (khash_iterator == kh_end(lookup_table_)) {
    if (khash_iterator == kh_end(lookup_table_)) {
      // std::cerr << "The minimizer is not in reference!\n";
      // std::cerr << "The minimizer is not in reference!\n";
      continue;
      continue;
@@ -558,21 +567,24 @@ int Index::GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand(
        if (strand == kPositive) {
        if (strand == kPositive) {
          const uint32_t candidate_position =
          const uint32_t candidate_position =
              reference_position - read_position;
              reference_position - read_position;
          const uint64_t seed_hit = (reference_id << 32) | candidate_position;
          const uint64_t seed_hit =
              GenerateCandidatePosition(reference_id, candidate_position);
          hits.push_back(seed_hit);
          hits.push_back(seed_hit);
        }
        }
      } else if (strand == kNegative) {
      } else if (strand == kNegative) {
        const uint32_t candidate_position =
        const uint32_t candidate_position =
            reference_position + read_position - kmer_size_ + 1;
            reference_position + read_position - kmer_size_ + 1;
        const uint64_t seed_hit = (reference_id << 32) | candidate_position;
        const uint64_t seed_hit =
            GenerateCandidatePosition(reference_id, candidate_position);
        hits.push_back(seed_hit);
        hits.push_back(seed_hit);
      }
      }


      continue;
      continue;
    }
    }


    const uint32_t offset = reference_seed_hit >> 32;
    const uint32_t offset = GenerateOffsetInOccurrenceTable(reference_seed_hit);
    const uint32_t num_occurrences = reference_seed_hit;
    const uint32_t num_occurrences =
        GenerateNumOccurrenceInOccurrenceTable(reference_seed_hit);
    int32_t prev_l = 0;
    int32_t prev_l = 0;
    for (uint32_t bi = 0; bi < boundary_size; ++bi) {
    for (uint32_t bi = 0; bi < boundary_size; ++bi) {
      // use binary search to locate the coordinate near mate position
      // use binary search to locate the coordinate near mate position
@@ -581,7 +593,9 @@ int Index::GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand(
      while (l <= r) {
      while (l <= r) {
        m = (l + r) / 2;
        m = (l + r) / 2;


        uint64_t reference_seed_hit = (occurrence_table_[offset + m]) >> 1;
        uint64_t reference_seed_hit =
            GenerateCandidatePositionFromOccurrenceTableEntry(
                occurrence_table_[offset + m]);


        if (reference_seed_hit < boundary) {
        if (reference_seed_hit < boundary) {
          l = m + 1;
          l = m + 1;
@@ -596,7 +610,8 @@ int Index::GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand(


      for (uint32_t oi = m; oi < num_occurrences; ++oi) {
      for (uint32_t oi = m; oi < num_occurrences; ++oi) {
        const uint64_t reference_seed_hit = occurrence_table_[offset + oi];
        const uint64_t reference_seed_hit = occurrence_table_[offset + oi];
        if ((reference_seed_hit >> 1) > boundaries[bi].second) {
        if ((GenerateCandidatePositionFromOccurrenceTableEntry(
                reference_seed_hit)) > boundaries[bi].second) {
          break;
          break;
        }
        }


@@ -610,13 +625,15 @@ int Index::GenerateCandidatePositionsFromRepetitiveReadWithMateInfoOnOneStrand(
          if (strand == kPositive) {
          if (strand == kPositive) {
            const uint32_t candidate_position =
            const uint32_t candidate_position =
                reference_position - read_position;
                reference_position - read_position;
            const uint64_t seed_hit = (reference_id << 32) | candidate_position;
            const uint64_t seed_hit =
                GenerateCandidatePosition(reference_id, candidate_position);
            hits.push_back(seed_hit);
            hits.push_back(seed_hit);
          }
          }
        } else if (strand == kNegative) {
        } else if (strand == kNegative) {
          const uint32_t candidate_position =
          const uint32_t candidate_position =
              reference_position + read_position - kmer_size_ + 1;
              reference_position + read_position - kmer_size_ + 1;
          const uint64_t seed_hit = (reference_id << 32) | candidate_position;
          const uint64_t seed_hit =
              GenerateCandidatePosition(reference_id, candidate_position);
          hits.push_back(seed_hit);
          hits.push_back(seed_hit);
        }
        }
      }
      }
+1 −12
Original line number Original line Diff line number Diff line
@@ -8,7 +8,7 @@


#include "candidate_position_generating_config.h"
#include "candidate_position_generating_config.h"
#include "index_parameters.h"
#include "index_parameters.h"
#include "khash.h"
#include "index_utils.h"
#include "mapping_metadata.h"
#include "mapping_metadata.h"
#include "minimizer.h"
#include "minimizer.h"
#include "sequence_batch.h"
#include "sequence_batch.h"
@@ -16,17 +16,6 @@


namespace chromap {
namespace chromap {


// Note that the max kmer size is 28 and its hash value is always saved in the
// lowest 56 bits of an unsigned 64-bit integer. When an element is inserted
// into the hash table, its hash value is left shifted by 1 bit and the lowest
// bit of the key value is set to 1 when the minimizer only occurs once. So
// right shift by one bit is lossless and safe.
#define KHashFunctionForIndex(a) ((a) >> 1)
#define KHashEqForIndex(a, b) ((a) >> 1 == (b) >> 1)
KHASH_INIT(/*name=*/k64, /*khkey_t=*/uint64_t, /*khval_t=*/uint64_t,
           /*kh_is_map=*/1, /*__hash_func=*/KHashFunctionForIndex,
           /*__hash_equal=*/KHashEqForIndex);

class Index {
class Index {
 public:
 public:
  Index() = delete;
  Index() = delete;

src/index_utils.h

0 → 100644
+51 −0
Original line number Original line Diff line number Diff line
#ifndef INDEX_UTILS_H_
#define INDEX_UTILSH_

#include "khash.h"

// Note that the max kmer size is 28 and its hash value is always saved in the
// lowest 56 bits of an unsigned 64-bit integer. When an element is inserted
// into the hash table, its hash value is left shifted by 1 bit and the lowest
// bit of the key value is set to 1 when the minimizer only occurs once. So
// right shift by one bit is lossless and safe.
#define KHashFunctionForIndex(a) ((a) >> 1)
#define KHashEqForIndex(a, b) ((a) >> 1 == (b) >> 1)
KHASH_INIT(/*name=*/k64, /*khkey_t=*/uint64_t, /*khval_t=*/uint64_t,
           /*kh_is_map=*/1, /*__hash_func=*/KHashFunctionForIndex,
           /*__hash_equal=*/KHashEqForIndex);

namespace chromap {

inline static uint64_t GenerateHashKeyInLookupTable(
    uint64_t minimizer_hash_key) {
  return minimizer_hash_key << 1;
}

inline static uint64_t GenerateEntryValueInLookupTable(
    uint64_t occurrence_table_offset, uint32_t num_occurrences) {
  return (occurrence_table_offset << 32) | num_occurrences;
}

inline static uint32_t GenerateOffsetInOccurrenceTable(
    uint64_t lookup_table_entry_value) {
  return lookup_table_entry_value >> 32;
}

inline static uint32_t GenerateNumOccurrenceInOccurrenceTable(
    uint64_t lookup_table_entry_value) {
  return static_cast<uint32_t>(lookup_table_entry_value);
}

inline static uint64_t GenerateCandidatePosition(uint64_t sequence_id,
                                                 uint32_t sequence_position) {
  return (sequence_id << 32) | sequence_position;
}

inline static uint64_t GenerateCandidatePositionFromOccurrenceTableEntry(
    uint64_t entry) {
  return entry >> 1;
}

}  // namespace chromap

#endif  // INDEX_UTILS_H_