Commit da05c6e7 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Clean up the code of SequenceBatch.

Remove GetSequenceBatchSize() to prevent bugs in the future.
parent e47e3a1f
Loading
Loading
Loading
Loading
+15 −14
Original line number Original line Diff line number Diff line
@@ -516,12 +516,10 @@ void Chromap<MappingRecord>::UpdateBarcodeAbundance(


template <typename MappingRecord>
template <typename MappingRecord>
void Chromap<MappingRecord>::GenerateCustomizedRidRank(
void Chromap<MappingRecord>::GenerateCustomizedRidRank(
    const std::string &rid_order_path, const SequenceBatch &reference,
    const std::string &rid_order_path, uint32_t num_reference_sequences,
    std::vector<int> &rid_rank) {
    const SequenceBatch &reference, std::vector<int> &rid_rank) {
  int ref_size = reference.GetSequenceBatchSize();
  rid_rank.resize(num_reference_sequences);
  int i = 0;
  for (uint32_t i = 0; i < num_reference_sequences; ++i) {
  rid_rank.resize(ref_size);
  for (i = 0; i < ref_size; ++i) {
    rid_rank[i] = i;
    rid_rank[i] = i;
  }
  }


@@ -532,7 +530,7 @@ void Chromap<MappingRecord>::GenerateCustomizedRidRank(
  std::map<std::string, int> rname_to_rank;
  std::map<std::string, int> rname_to_rank;
  std::ifstream file_stream(rid_order_path);
  std::ifstream file_stream(rid_order_path);
  std::string line;
  std::string line;
  i = 0;
  uint32_t i = 0;
  while (getline(file_stream, line)) {
  while (getline(file_stream, line)) {
    rname_to_rank[line] = i;
    rname_to_rank[line] = i;
    i += 1;
    i += 1;
@@ -540,7 +538,7 @@ void Chromap<MappingRecord>::GenerateCustomizedRidRank(
  file_stream.close();
  file_stream.close();


  // First put the chrosomes in the list provided by user
  // First put the chrosomes in the list provided by user
  for (i = 0; i < ref_size; ++i) {
  for (uint32_t i = 0; i < num_reference_sequences; ++i) {
    std::string rname(reference.GetSequenceNameAt(i));
    std::string rname(reference.GetSequenceNameAt(i));
    if (rname_to_rank.find(rname) != rname_to_rank.end()) {
    if (rname_to_rank.find(rname) != rname_to_rank.end()) {
      rid_rank[i] = rname_to_rank[rname];
      rid_rank[i] = rname_to_rank[rname];
@@ -551,16 +549,16 @@ void Chromap<MappingRecord>::GenerateCustomizedRidRank(


  // we may have some rank without any rid associated with, this helps if
  // we may have some rank without any rid associated with, this helps if
  // cutstom list contains rid not in the reference`
  // cutstom list contains rid not in the reference`
  int k = rname_to_rank.size();
  uint32_t k = rname_to_rank.size();
  // Put the remaining chrosomes
  // Put the remaining chrosomes
  for (i = 0; i < ref_size; ++i) {
  for (uint32_t i = 0; i < num_reference_sequences; ++i) {
    if (rid_rank[i] == -1) {
    if (rid_rank[i] == -1) {
      rid_rank[i] = k;
      rid_rank[i] = k;
      ++k;
      ++k;
    }
    }
  }
  }


  if (k > ref_size) {
  if (k > num_reference_sequences) {
    ExitWithMessage("Unknown chromsome names found in chromosome order file");
    ExitWithMessage("Unknown chromsome names found in chromosome order file");
  }
  }


@@ -590,12 +588,14 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  if (mapping_parameters_.custom_rid_order_path.length() > 0) {
  if (mapping_parameters_.custom_rid_order_path.length() > 0) {
    GenerateCustomizedRidRank(mapping_parameters_.custom_rid_order_path,
    GenerateCustomizedRidRank(mapping_parameters_.custom_rid_order_path,
                              reference, custom_rid_rank_);
                              num_reference_sequences, reference,
                              custom_rid_rank_);
    reference.ReorderSequences(custom_rid_rank_);
    reference.ReorderSequences(custom_rid_rank_);
  }
  }
  if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) {
  if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) {
    GenerateCustomizedRidRank(mapping_parameters_.pairs_custom_rid_order_path,
    GenerateCustomizedRidRank(mapping_parameters_.pairs_custom_rid_order_path,
                              reference, pairs_custom_rid_rank_);
                              num_reference_sequences, reference,
                              pairs_custom_rid_rank_);
  }
  }


  // Load index
  // Load index
@@ -1164,7 +1164,8 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  if (mapping_parameters_.custom_rid_order_path.length() > 0) {
  if (mapping_parameters_.custom_rid_order_path.length() > 0) {
    GenerateCustomizedRidRank(mapping_parameters_.custom_rid_order_path,
    GenerateCustomizedRidRank(mapping_parameters_.custom_rid_order_path,
                              reference, custom_rid_rank_);
                              num_reference_sequences, reference,
                              custom_rid_rank_);
    reference.ReorderSequences(custom_rid_rank_);
    reference.ReorderSequences(custom_rid_rank_);
  }
  }


+1 −0
Original line number Original line Diff line number Diff line
@@ -110,6 +110,7 @@ class Chromap {
                                   &mappings_on_diff_ref_seqs);
                                   &mappings_on_diff_ref_seqs);


  void GenerateCustomizedRidRank(const std::string &rid_order_path,
  void GenerateCustomizedRidRank(const std::string &rid_order_path,
                                 uint32_t num_reference_sequences,
                                 const SequenceBatch &reference,
                                 const SequenceBatch &reference,
                                 std::vector<int> &rid_rank);
                                 std::vector<int> &rid_rank);


+29 −2
Original line number Original line Diff line number Diff line
@@ -6,12 +6,12 @@
#include "utils.h"
#include "utils.h"


namespace chromap {
namespace chromap {

constexpr uint8_t SequenceBatch::char_to_uint8_table_[256];
constexpr uint8_t SequenceBatch::char_to_uint8_table_[256];
constexpr char SequenceBatch::uint8_to_char_table_[8];
constexpr char SequenceBatch::uint8_to_char_table_[8];


void SequenceBatch::InitializeLoading(const std::string &sequence_file_path) {
void SequenceBatch::InitializeLoading(const std::string &sequence_file_path) {
  sequence_file_path_ = sequence_file_path;
  sequence_file_ = gzopen(sequence_file_path.c_str(), "r");
  sequence_file_ = gzopen(sequence_file_path_.c_str(), "r");
  if (sequence_file_ == NULL) {
  if (sequence_file_ == NULL) {
    ExitWithMessage("Cannot find sequence file" + sequence_file_path);
    ExitWithMessage("Cannot find sequence file" + sequence_file_path);
  }
  }
@@ -137,4 +137,31 @@ void SequenceBatch::FinalizeLoading() {
  kseq_destroy(sequence_kseq_);
  kseq_destroy(sequence_kseq_);
  gzclose(sequence_file_);
  gzclose(sequence_file_);
}
}

void SequenceBatch::ReplaceByEffectiveRange(kstring_t &seq) {
  if (effective_range_[0] == 0 && effective_range_[1] == -1 &&
      effective_range_[2] == 1) {
    return;
  }
  int i, j;
  int start = effective_range_[0];
  int end = effective_range_[1];
  if (effective_range_[1] == -1) end = seq.l - 1;
  for (i = 0; i < end - start + 1; ++i) {
    seq.s[i] = seq.s[start + i];
  }
  seq.s[i] = '\0';
  seq.l = end - start + 1;
  if (effective_range_[2] == -1) {
    for (i = 0; i < (int)seq.l; ++i) {
      seq.s[i] = Uint8ToChar(((uint8_t)3) ^ (CharToUint8(seq.s[i])));
    }
    for (i = 0, j = seq.l - 1; i < j; ++i, --j) {
      char tmp = seq.s[i];
      seq.s[i] = seq.s[j];
      seq.s[j] = tmp;
    }
  }
}

}  // namespace chromap
}  // namespace chromap
+36 −35
Original line number Original line Diff line number Diff line
@@ -11,6 +11,7 @@
#include "kseq.h"
#include "kseq.h"


namespace chromap {
namespace chromap {

class SequenceBatch {
class SequenceBatch {
 public:
 public:
  KSEQ_INIT(gzFile, gzread);
  KSEQ_INIT(gzFile, gzread);
@@ -19,9 +20,10 @@ class SequenceBatch {
    effective_range_[1] = -1;
    effective_range_[1] = -1;
    effective_range_[2] = 1;
    effective_range_[2] = 1;
  }
  }

  // Construct once and use update sequences when loading each batch.
  SequenceBatch(uint32_t max_num_sequences)
  SequenceBatch(uint32_t max_num_sequences)
      : max_num_sequences_(max_num_sequences) {
      : max_num_sequences_(max_num_sequences) {
    // Construct once and use update methods when loading each batch
    sequence_batch_.reserve(max_num_sequences_);
    sequence_batch_.reserve(max_num_sequences_);
    for (uint32_t i = 0; i < max_num_sequences_; ++i) {
    for (uint32_t i = 0; i < max_num_sequences_; ++i) {
      sequence_batch_.emplace_back((kseq_t *)calloc(1, sizeof(kseq_t)));
      sequence_batch_.emplace_back((kseq_t *)calloc(1, sizeof(kseq_t)));
@@ -32,6 +34,7 @@ class SequenceBatch {
    effective_range_[1] = -1;
    effective_range_[1] = -1;
    effective_range_[2] = 1;
    effective_range_[2] = 1;
  }
  }

  ~SequenceBatch() {
  ~SequenceBatch() {
    if (sequence_batch_.size() > 0) {
    if (sequence_batch_.size() > 0) {
      for (uint32_t i = 0; i < sequence_batch_.size(); ++i) {
      for (uint32_t i = 0; i < sequence_batch_.size(); ++i) {
@@ -39,46 +42,58 @@ class SequenceBatch {
      }
      }
    }
    }
  }
  }

  inline uint32_t GetMaxBatchSize() const { return max_num_sequences_; }
  inline uint32_t GetMaxBatchSize() const { return max_num_sequences_; }

  inline uint64_t GetNumBases() const { return num_bases_; }
  inline uint64_t GetNumBases() const { return num_bases_; }

  inline std::vector<kseq_t *> &GetSequenceBatch() { return sequence_batch_; }
  inline std::vector<kseq_t *> &GetSequenceBatch() { return sequence_batch_; }

  inline std::vector<std::string> &GetNegativeSequenceBatch() {
  inline std::vector<std::string> &GetNegativeSequenceBatch() {
    return negative_sequence_batch_;
    return negative_sequence_batch_;
  }
  }

  inline const char *GetSequenceAt(uint32_t sequence_index) const {
  inline const char *GetSequenceAt(uint32_t sequence_index) const {
    return sequence_batch_[sequence_index]->seq.s;
    return sequence_batch_[sequence_index]->seq.s;
  }
  }

  inline uint32_t GetSequenceLengthAt(uint32_t sequence_index) const {
  inline uint32_t GetSequenceLengthAt(uint32_t sequence_index) const {
    return sequence_batch_[sequence_index]->seq.l;
    return sequence_batch_[sequence_index]->seq.l;
  }
  }

  inline const char *GetSequenceNameAt(uint32_t sequence_index) const {
  inline const char *GetSequenceNameAt(uint32_t sequence_index) const {
    return sequence_batch_[sequence_index]->name.s;
    return sequence_batch_[sequence_index]->name.s;
  }
  }

  inline uint32_t GetSequenceNameLengthAt(uint32_t sequence_index) const {
  inline uint32_t GetSequenceNameLengthAt(uint32_t sequence_index) const {
    return sequence_batch_[sequence_index]->name.l;
    return sequence_batch_[sequence_index]->name.l;
  }
  }

  inline const char *GetSequenceQualAt(uint32_t sequence_index) const {
  inline const char *GetSequenceQualAt(uint32_t sequence_index) const {
    return sequence_batch_[sequence_index]->qual.s;
    return sequence_batch_[sequence_index]->qual.s;
  }
  }
  inline uint32_t GetSequenceIdAt(uint32_t sequence_index) const {
  inline uint32_t GetSequenceIdAt(uint32_t sequence_index) const {
    return sequence_batch_[sequence_index]->id;
    return sequence_batch_[sequence_index]->id;
  }
  }

  inline const std::string &GetNegativeSequenceAt(
  inline const std::string &GetNegativeSequenceAt(
      uint32_t sequence_index) const {
      uint32_t sequence_index) const {
    return negative_sequence_batch_[sequence_index];
    return negative_sequence_batch_[sequence_index];
  }
  }
  inline int GetSequenceBatchSize() const { return sequence_batch_.size(); }

  inline void SetSeqEffectiveRange(int start, int end, int strand) {
  inline void SetSeqEffectiveRange(int start, int end, int strand) {
    effective_range_[0] = start;
    effective_range_[0] = start;
    effective_range_[1] = end;
    effective_range_[1] = end;
    effective_range_[2] = strand;
    effective_range_[2] = strand;
  }
  }

  //  inline char GetReverseComplementBaseOfSequenceAt(uint32_t sequence_index,
  //  inline char GetReverseComplementBaseOfSequenceAt(uint32_t sequence_index,
  //  uint32_t position) {
  //  uint32_t position) {
  //    kseq_t *sequence = sequence_batch_[sequence_index];
  //    kseq_t *sequence = sequence_batch_[sequence_index];
  //    return Uint8ToChar(((uint8_t)3) ^
  //    return Uint8ToChar(((uint8_t)3) ^
  //    (CharToUint8((sequence->seq.s)[sequence->seq.l - position - 1])));
  //    (CharToUint8((sequence->seq.s)[sequence->seq.l - position - 1])));
  //  }
  //  }

  inline void PrepareNegativeSequenceAt(uint32_t sequence_index) {
  inline void PrepareNegativeSequenceAt(uint32_t sequence_index) {
    kseq_t *sequence = sequence_batch_[sequence_index];
    kseq_t *sequence = sequence_batch_[sequence_index];
    uint32_t sequence_length = sequence->seq.l;
    uint32_t sequence_length = sequence->seq.l;
@@ -91,6 +106,7 @@ class SequenceBatch {
          (CharToUint8((sequence->seq.s)[sequence_length - i - 1]))));
          (CharToUint8((sequence->seq.s)[sequence_length - i - 1]))));
    }
    }
  }
  }

  inline void TrimSequenceAt(uint32_t sequence_index, int length_after_trim) {
  inline void TrimSequenceAt(uint32_t sequence_index, int length_after_trim) {
    kseq_t *sequence = sequence_batch_[sequence_index];
    kseq_t *sequence = sequence_batch_[sequence_index];
    negative_sequence_batch_[sequence_index].erase(
    negative_sequence_batch_[sequence_index].erase(
@@ -102,17 +118,24 @@ class SequenceBatch {
    sequence->qual.l = length_after_trim;
    sequence->qual.l = length_after_trim;
    sequence->qual.s[sequence->qual.l] = '\0';
    sequence->qual.s[sequence->qual.l] = '\0';
  }
  }

  inline void SwapSequenceBatch(SequenceBatch &batch) {
  inline void SwapSequenceBatch(SequenceBatch &batch) {
    sequence_batch_.swap(batch.GetSequenceBatch());
    sequence_batch_.swap(batch.GetSequenceBatch());
    negative_sequence_batch_.swap(batch.GetNegativeSequenceBatch());
    negative_sequence_batch_.swap(batch.GetNegativeSequenceBatch());
  }
  }

  void InitializeLoading(const std::string &sequence_file_path);
  void InitializeLoading(const std::string &sequence_file_path);

  void FinalizeLoading();
  void FinalizeLoading();

  // Return the number of reads loaded into the batch
  // Return the number of reads loaded into the batch
  // and return 0 if there is no more reads
  // and return 0 if there is no more reads
  uint32_t LoadBatch();
  uint32_t LoadBatch();

  bool LoadOneSequenceAndSaveAt(uint32_t sequence_index);
  bool LoadOneSequenceAndSaveAt(uint32_t sequence_index);

  uint32_t LoadAllSequences();
  uint32_t LoadAllSequences();

  inline void CorrectBaseAt(uint32_t sequence_index, uint32_t base_position,
  inline void CorrectBaseAt(uint32_t sequence_index, uint32_t base_position,
                            char correct_base) {
                            char correct_base) {
    kseq_t *sequence = sequence_batch_[sequence_index];
    kseq_t *sequence = sequence_batch_[sequence_index];
@@ -122,6 +145,7 @@ class SequenceBatch {
  inline static uint8_t CharToUint8(const char c) {
  inline static uint8_t CharToUint8(const char c) {
    return char_to_uint8_table_[(uint8_t)c];
    return char_to_uint8_table_[(uint8_t)c];
  }
  }

  inline static char Uint8ToChar(const uint8_t i) {
  inline static char Uint8ToChar(const uint8_t i) {
    return uint8_to_char_table_[i];
    return uint8_to_char_table_[i];
  }
  }
@@ -175,28 +199,29 @@ class SequenceBatch {
    std::vector<kseq_t *> tmp_sequence_batch_ = sequence_batch_;
    std::vector<kseq_t *> tmp_sequence_batch_ = sequence_batch_;
    std::vector<std::string> tmp_negative_sequence_batch_ =
    std::vector<std::string> tmp_negative_sequence_batch_ =
        negative_sequence_batch_;
        negative_sequence_batch_;
    int i;
    for (size_t i = 0; i < sequence_batch_.size(); ++i) {
    int sequence_size = sequence_batch_.size();
    for (i = 0; i < sequence_size; ++i) {
      sequence_batch_[rid_rank[i]] = tmp_sequence_batch_[i];
      sequence_batch_[rid_rank[i]] = tmp_sequence_batch_[i];
    }
    }

    if (negative_sequence_batch_.size() > 0) {
    if (negative_sequence_batch_.size() > 0) {
      for (i = 0; i < sequence_size; ++i) {
      for (size_t i = 0; i < sequence_batch_.size(); ++i) {
        negative_sequence_batch_[rid_rank[i]] = tmp_negative_sequence_batch_[i];
        negative_sequence_batch_[rid_rank[i]] = tmp_negative_sequence_batch_[i];
      }
      }
    }
    }
  }
  }


 protected:
 protected:
  void ReplaceByEffectiveRange(kstring_t &seq);

  uint32_t num_loaded_sequences_ = 0;
  uint32_t num_loaded_sequences_ = 0;
  uint32_t max_num_sequences_;
  uint32_t max_num_sequences_ = 0;
  uint64_t num_bases_;
  uint64_t num_bases_ = 0;
  std::string sequence_file_path_;
  gzFile sequence_file_;
  gzFile sequence_file_;
  kseq_t *sequence_kseq_;
  kseq_t *sequence_kseq_ = nullptr;
  std::vector<kseq_t *> sequence_batch_;
  std::vector<kseq_t *> sequence_batch_;
  std::vector<std::string> negative_sequence_batch_;
  std::vector<std::string> negative_sequence_batch_;
  int effective_range_[3] = {0, -1, 1};  // actual range within each sequence.
  int effective_range_[3] = {0, -1, 1};  // actual range within each sequence.

  static constexpr uint8_t char_to_uint8_table_[256] = {
  static constexpr uint8_t char_to_uint8_table_[256] = {
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -211,32 +236,8 @@ class SequenceBatch {
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
  static constexpr char uint8_to_char_table_[8] = {'A', 'C', 'G', 'T',
  static constexpr char uint8_to_char_table_[8] = {'A', 'C', 'G', 'T',
                                                   'N', 'N', 'N', 'N'};
                                                   'N', 'N', 'N', 'N'};
  void ReplaceByEffectiveRange(kstring_t &seq) {
    if (effective_range_[0] == 0 && effective_range_[1] == -1 &&
        effective_range_[2] == 1) {
      return;
    }
    int i, j;
    int start = effective_range_[0];
    int end = effective_range_[1];
    if (effective_range_[1] == -1) end = seq.l - 1;
    for (i = 0; i < end - start + 1; ++i) {
      seq.s[i] = seq.s[start + i];
    }
    seq.s[i] = '\0';
    seq.l = end - start + 1;
    if (effective_range_[2] == -1) {
      for (i = 0; i < (int)seq.l; ++i) {
        seq.s[i] = Uint8ToChar(((uint8_t)3) ^ (CharToUint8(seq.s[i])));
      }
      for (i = 0, j = seq.l - 1; i < j; ++i, --j) {
        char tmp = seq.s[i];
        seq.s[i] = seq.s[j];
        seq.s[j] = tmp;
      }
    }
  }
};
};

}  // namespace chromap
}  // namespace chromap


#endif  // SEQUENCEBATCH_H_
#endif  // SEQUENCEBATCH_H_