Commit 7f283b6f authored by Haowen Zhang's avatar Haowen Zhang Committed by Li Song
Browse files

Fix a bug for hashing 32 bp sequences.

parent e8851a40
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@
#include <vector>

#include "khash.h"
#include "sequence_batch.h"
#include "utils.h"

namespace chromap {

@@ -108,8 +108,8 @@ class BarcodeTranslator {
    to = line.substr(0, i);
    // from = line.substr(i + 1, len - i - 1);
    from_bc_length_ = len - i - 1;
    uint64_t from_seed = SequenceBatch::GenerateSeedFromSequence(
        line.c_str(), len, i + 1, from_bc_length_);
    uint64_t from_seed =
        GenerateSeedFromSequence(line.c_str(), len, i + 1, from_bc_length_);

    int khash_return_code;
    khiter_t barcode_translate_table_iter = kh_put(
+1 −1
Original line number Diff line number Diff line
@@ -357,7 +357,7 @@ void Chromap::LoadBarcodeWhitelist() {
    //  //first_line = false;
    //}
    // assert(kmer.length() == (size_t)kmer_size_);
    uint64_t barcode_key = SequenceBatch::GenerateSeedFromSequence(
    uint64_t barcode_key = GenerateSeedFromSequence(
        barcode.data(), barcode_length, 0, barcode_length);
    // PoreModelParameters &pore_model_parameters =
    // pore_models_[kmer_hash_value]; barcode_whitelist_file_line_string_stream
+1 −1
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@
#include "temp_mapping.h"
#include "utils.h"

#define CHROMAP_VERSION "0.2.3-r407"
#define CHROMAP_VERSION "0.2.3-r408"

namespace chromap {

+3 −37
Original line number Diff line number Diff line
@@ -164,43 +164,9 @@ class SequenceBatch {
                                             uint32_t start_position,
                                             uint32_t seed_length) const {
    const char *sequence = GetSequenceAt(sequence_index);
    uint32_t sequence_length = GetSequenceLengthAt(sequence_index);
    uint64_t mask = (((uint64_t)1) << (2 * seed_length)) - 1;
    uint64_t seed = 0;
    for (uint32_t i = 0; i < seed_length; ++i) {
      if (start_position + i < sequence_length) {
        uint8_t current_base = CharToUint8(sequence[i + start_position]);
        if (current_base < 4) {                        // not an ambiguous base
          seed = ((seed << 2) | current_base) & mask;  // forward k-mer
        } else {
          seed = (seed << 2) & mask;  // N->A
        }
      } else {
        seed = (seed << 2) & mask;  // Pad A
      }
    }
    return seed;
  }

  inline static uint64_t GenerateSeedFromSequence(const char *sequence,
                                                  uint32_t sequence_length,
                                                  uint32_t start_position,
                                                  uint32_t seed_length) {
    uint64_t mask = (((uint64_t)1) << (2 * seed_length)) - 1;
    uint64_t seed = 0;
    for (uint32_t i = 0; i < seed_length; ++i) {
      if (start_position + i < sequence_length) {
        uint8_t current_base = CharToUint8(sequence[i + start_position]);
        if (current_base < 4) {                        // not an ambiguous base
          seed = ((seed << 2) | current_base) & mask;  // forward k-mer
        } else {
          seed = (seed << 2) & mask;  // N->A
        }
      } else {
        seed = (seed << 2) & mask;  // Pad A
      }
    }
    return seed;
    const uint32_t sequence_length = GetSequenceLengthAt(sequence_index);
    return GenerateSeedFromSequence(sequence, sequence_length, start_position,
                                    seed_length);
  }

  inline void ReorderSequences(const std::vector<int> &rid_rank) {
+21 −0
Original line number Diff line number Diff line
@@ -109,6 +109,27 @@ inline static char Uint8ToChar(const uint8_t i) {
  return uint8_to_char_table_[i];
}

// Make sure the length is not greater than 32 before calling this function.
inline static uint64_t GenerateSeedFromSequence(const char *sequence,
                                                uint32_t sequence_length,
                                                uint32_t start_position,
                                                uint32_t seed_length) {
  uint64_t seed = 0;
  for (uint32_t i = 0; i < seed_length; ++i) {
    if (start_position + i < sequence_length) {
      uint8_t current_base = CharToUint8(sequence[i + start_position]);
      if (current_base < 4) {               // not an ambiguous base
        seed = (seed << 2) | current_base;  // forward k-mer
      } else {
        seed = seed << 2;  // N->A
      }
    } else {
      seed = seed << 2;  // Pad A
    }
  }
  return seed;
}

}  // namespace chromap

#endif  // UTILS_H_