Commit b3fad740 authored by Swift Genomics's avatar Swift Genomics Committed by swiftgenomics
Browse files

Refactor SequenceBatch.

parent 1fdd749e
Loading
Loading
Loading
Loading
+2 −1
Original line number Original line Diff line number Diff line
@@ -19,7 +19,8 @@ void Chromap::ConstructIndex() {
  // Load all sequences in the reference into one batch
  // Load all sequences in the reference into one batch
  SequenceBatch reference;
  SequenceBatch reference;
  reference.InitializeLoading(index_parameters_.reference_file_path);
  reference.InitializeLoading(index_parameters_.reference_file_path);
  uint32_t num_sequences = reference.LoadAllSequences();
  reference.LoadAllSequences();
  const uint32_t num_sequences = reference.GetNumSequences();
  Index index(index_parameters_);
  Index index(index_parameters_);
  index.Construct(num_sequences, reference);
  index.Construct(num_sequences, reference);
  index.Statistics(num_sequences, reference);
  index.Statistics(num_sequences, reference);
+5 −3
Original line number Original line Diff line number Diff line
@@ -29,7 +29,7 @@
#include "temp_mapping.h"
#include "temp_mapping.h"
#include "utils.h"
#include "utils.h"


#define CHROMAP_VERSION "0.2.3-r452"
#define CHROMAP_VERSION "0.2.3-r458"


namespace chromap {
namespace chromap {


@@ -174,7 +174,8 @@ void Chromap::MapSingleEndReads() {


  SequenceBatch reference;
  SequenceBatch reference;
  reference.InitializeLoading(mapping_parameters_.reference_file_path);
  reference.InitializeLoading(mapping_parameters_.reference_file_path);
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  reference.LoadAllSequences();
  uint32_t num_reference_sequences = reference.GetNumSequences();
  if (mapping_parameters_.custom_rid_order_file_path.length() > 0) {
  if (mapping_parameters_.custom_rid_order_file_path.length() > 0) {
    GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path,
    GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path,
                           num_reference_sequences, reference,
                           num_reference_sequences, reference,
@@ -539,7 +540,8 @@ void Chromap::MapPairedEndReads() {
  // Load reference
  // Load reference
  SequenceBatch reference;
  SequenceBatch reference;
  reference.InitializeLoading(mapping_parameters_.reference_file_path);
  reference.InitializeLoading(mapping_parameters_.reference_file_path);
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  reference.LoadAllSequences();
  uint32_t num_reference_sequences = reference.GetNumSequences();
  if (mapping_parameters_.custom_rid_order_file_path.length() > 0) {
  if (mapping_parameters_.custom_rid_order_file_path.length() > 0) {
    GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path,
    GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_file_path,
                           num_reference_sequences, reference,
                           num_reference_sequences, reference,
+61 −76
Original line number Original line Diff line number Diff line
@@ -14,90 +14,81 @@ void SequenceBatch::InitializeLoading(const std::string &sequence_file_path) {
  sequence_kseq_ = kseq_init(sequence_file_);
  sequence_kseq_ = kseq_init(sequence_file_);
}
}


uint32_t SequenceBatch::LoadBatch() {
void SequenceBatch::FinalizeLoading() {
  double real_start_time = GetRealTime();
  kseq_destroy(sequence_kseq_);
  uint32_t num_sequences = 0;
  gzclose(sequence_file_);
  num_bases_ = 0;
}
  for (uint32_t sequence_index = 0; sequence_index < max_num_sequences_;

       ++sequence_index) {
bool SequenceBatch::LoadOneSequenceAndSaveAt(uint32_t sequence_index) {
  if (sequence_index == 0) {
    num_loaded_sequences_ = 0;
  }

  int length = kseq_read(sequence_kseq_);
  int length = kseq_read(sequence_kseq_);
    while (length == 0) {  // Skip the sequences of length 0
  while (length == 0) {
    length = kseq_read(sequence_kseq_);
    length = kseq_read(sequence_kseq_);
  }
  }

  if (length > 0) {
  if (length > 0) {
    kseq_t *sequence = sequence_batch_[sequence_index];
    kseq_t *sequence = sequence_batch_[sequence_index];
    std::swap(sequence_kseq_->seq, sequence->seq);
    std::swap(sequence_kseq_->seq, sequence->seq);
    ReplaceByEffectiveRange(sequence->seq, /*is_seq=*/true);
    ReplaceByEffectiveRange(sequence->seq, /*is_seq=*/true);
    std::swap(sequence_kseq_->name, sequence->name);
    std::swap(sequence_kseq_->name, sequence->name);
    std::swap(sequence_kseq_->comment, sequence->comment);
    std::swap(sequence_kseq_->comment, sequence->comment);
    sequence->id = total_num_loaded_sequences_;
    ++total_num_loaded_sequences_;

    if (sequence_index >= num_loaded_sequences_) {
      ++num_loaded_sequences_;
    } else if (sequence_index + 1 != num_loaded_sequences_) {
      std::cerr << sequence_index << " " << num_loaded_sequences_ << "\n";
      ExitWithMessage(
          "Shouldn't override other sequences rather than the last!");
    }

    if (sequence_kseq_->qual.l != 0) {  // fastq file
    if (sequence_kseq_->qual.l != 0) {  // fastq file
      std::swap(sequence_kseq_->qual, sequence->qual);
      std::swap(sequence_kseq_->qual, sequence->qual);
      ReplaceByEffectiveRange(sequence->qual, /*is_seq=*/false);
      ReplaceByEffectiveRange(sequence->qual, /*is_seq=*/false);
    }
    }
      sequence->id = num_loaded_sequences_;
    return false;
      ++num_loaded_sequences_;
  }
      ++num_sequences;

      num_bases_ += length;
  // Make sure to reach the end of the file rather than meet an error.
    } else {
  if (length != -1) {
  if (length != -1) {
    ExitWithMessage(
    ExitWithMessage(
        "Didn't reach the end of sequence file, which might be corrupted!");
        "Didn't reach the end of sequence file, which might be corrupted!");
  }
  }
      // make sure to reach the end of file rather than meet an error
  return true;
}

uint32_t SequenceBatch::LoadBatch() {
  double real_start_time = GetRealTime();
  num_loaded_sequences_ = 0;
  for (uint32_t sequence_index = 0; sequence_index < max_num_sequences_;
       ++sequence_index) {
    if (LoadOneSequenceAndSaveAt(sequence_index)) {
      break;
      break;
    }
    }
  }
  }
  if (num_sequences != 0) {

  if (num_loaded_sequences_ != 0) {
    std::cerr << "Loaded sequence batch successfully in "
    std::cerr << "Loaded sequence batch successfully in "
              << GetRealTime() - real_start_time << "s, ";
              << GetRealTime() - real_start_time << "s, ";
    std::cerr << "number of sequences: " << num_sequences << ", ";
    std::cerr << "number of sequences: " << num_loaded_sequences_ << ".\n";
    std::cerr << "number of bases: " << num_bases_ << ".\n";
  } else {
  } else {
    std::cerr << "No more sequences.\n";
    std::cerr << "No more sequences.\n";
  }
  }
  return num_sequences;
  return num_loaded_sequences_;
}
}


bool SequenceBatch::LoadOneSequenceAndSaveAt(uint32_t sequence_index) {
void SequenceBatch::LoadAllSequences() {
  // double real_start_time = Chromap::GetRealTime();
  bool no_more_sequence = false;
  int length = kseq_read(sequence_kseq_);
  while (length == 0) {  // Skip the sequences of length 0
    length = kseq_read(sequence_kseq_);
  }
  if (length > 0) {
    kseq_t *sequence = sequence_batch_[sequence_index];
    std::swap(sequence_kseq_->seq, sequence->seq);
    ReplaceByEffectiveRange(sequence->seq, /*is_seq=*/true);
    std::swap(sequence_kseq_->name, sequence->name);
    std::swap(sequence_kseq_->comment, sequence->comment);
    sequence->id = num_loaded_sequences_;
    ++num_loaded_sequences_;
    if (sequence_kseq_->qual.l != 0) {  // fastq file
      std::swap(sequence_kseq_->qual, sequence->qual);
      ReplaceByEffectiveRange(sequence->qual, /*is_seq=*/false);
    }
  } else {
    if (length != -1) {
      ExitWithMessage(
          "Didn't reach the end of sequence file, which might be corrupted!");
    }
    // make sure to reach the end of file rather than meet an error
    no_more_sequence = true;
  }
  return no_more_sequence;
}

uint32_t SequenceBatch::LoadAllSequences() {
  double real_start_time = GetRealTime();
  double real_start_time = GetRealTime();
  sequence_batch_.reserve(200);
  sequence_batch_.reserve(200);
  uint32_t num_sequences = 0;
  num_loaded_sequences_ = 0;
  num_bases_ = 0;
  num_bases_ = 0;
  int length = kseq_read(sequence_kseq_);
  int length = kseq_read(sequence_kseq_);
  while (length >= 0) {
  while (length >= 0) {
    if (length == 0) {  // Skip the sequences of length 0
    if (length > 0) {
      continue;
    } else if (length > 0) {
      sequence_batch_.emplace_back((kseq_t *)calloc(1, sizeof(kseq_t)));
      sequence_batch_.emplace_back((kseq_t *)calloc(1, sizeof(kseq_t)));
      kseq_t *sequence = sequence_batch_.back();
      kseq_t *sequence = sequence_batch_.back();
      std::swap(sequence_kseq_->seq, sequence->seq);
      std::swap(sequence_kseq_->seq, sequence->seq);
@@ -108,30 +99,24 @@ uint32_t SequenceBatch::LoadAllSequences() {
        std::swap(sequence_kseq_->qual, sequence->qual);
        std::swap(sequence_kseq_->qual, sequence->qual);
        ReplaceByEffectiveRange(sequence->qual, /*is_seq=*/false);
        ReplaceByEffectiveRange(sequence->qual, /*is_seq=*/false);
      }
      }
      sequence->id = num_loaded_sequences_;
      sequence->id = total_num_loaded_sequences_;
      ++total_num_loaded_sequences_;
      ++num_loaded_sequences_;
      ++num_loaded_sequences_;
      ++num_sequences;
      num_bases_ += length;
      num_bases_ += length;
    } else {
    }
    length = kseq_read(sequence_kseq_);
  }

  // Make sure to reach the end of the file rather than meet an error.
  if (length != -1) {
  if (length != -1) {
    ExitWithMessage(
    ExitWithMessage(
        "Didn't reach the end of sequence file, which might be corrupted!");
        "Didn't reach the end of sequence file, which might be corrupted!");
  }
  }
      // make sure to reach the end of file rather than meet an error

      break;
    }
    length = kseq_read(sequence_kseq_);
  }
  std::cerr << "Loaded all sequences successfully in "
  std::cerr << "Loaded all sequences successfully in "
            << GetRealTime() - real_start_time << "s, ";
            << GetRealTime() - real_start_time << "s, ";
  std::cerr << "number of sequences: " << num_sequences << ", ";
  std::cerr << "number of sequences: " << num_loaded_sequences_ << ", ";
  std::cerr << "number of bases: " << num_bases_ << ".\n";
  std::cerr << "number of bases: " << num_bases_ << ".\n";
  return num_sequences;
}

void SequenceBatch::FinalizeLoading() {
  kseq_destroy(sequence_kseq_);
  gzclose(sequence_file_);
}
}


void SequenceBatch::ReplaceByEffectiveRange(kstring_t &seq, bool is_seq) {
void SequenceBatch::ReplaceByEffectiveRange(kstring_t &seq, bool is_seq) {
+35 −8
Original line number Original line Diff line number Diff line
@@ -17,6 +17,9 @@ namespace chromap {
class SequenceBatch {
class SequenceBatch {
 public:
 public:
  KSEQ_INIT(gzFile, gzread);
  KSEQ_INIT(gzFile, gzread);

  // When 'max_num_sequences' is not specified. This batch can be used to load
  // any number of sequences with a positive full effective range.
  SequenceBatch() : effective_range_(SequenceEffectiveRange()) {}
  SequenceBatch() : effective_range_(SequenceEffectiveRange()) {}


  // Construct once and use update sequences when loading each batch.
  // Construct once and use update sequences when loading each batch.
@@ -40,6 +43,8 @@ class SequenceBatch {
    }
    }
  }
  }


  inline uint64_t GetNumSequences() const { return num_loaded_sequences_; }

  inline uint32_t GetMaxBatchSize() const { return max_num_sequences_; }
  inline uint32_t GetMaxBatchSize() const { return max_num_sequences_; }


  inline uint64_t GetNumBases() const { return num_bases_; }
  inline uint64_t GetNumBases() const { return num_bases_; }
@@ -121,7 +126,6 @@ class SequenceBatch {


  inline void TrimSequenceAt(uint32_t sequence_index, int length_after_trim) {
  inline void TrimSequenceAt(uint32_t sequence_index, int length_after_trim) {
    kseq_t *sequence = sequence_batch_[sequence_index];
    kseq_t *sequence = sequence_batch_[sequence_index];

    if (length_after_trim >= (int)sequence->seq.l) {
    if (length_after_trim >= (int)sequence->seq.l) {
      return;
      return;
    }
    }
@@ -146,13 +150,19 @@ class SequenceBatch {


  void FinalizeLoading();
  void FinalizeLoading();


  // Return the number of reads loaded into the batch
  // The func should never override other sequences rather than the last, which
  // and return 0 if there is no more reads
  // means 'sequence_index' cannot be smaller than 'num_loaded_sequences_' - 1.
  uint32_t LoadBatch();
  // Return true when reaching the end of the file.

  bool LoadOneSequenceAndSaveAt(uint32_t sequence_index);
  bool LoadOneSequenceAndSaveAt(uint32_t sequence_index);


  uint32_t LoadAllSequences();
  // Return the number of sequences loaded into the batch and return 0 if there
  // is no more sequences. This func now is only used to load barcodes.
  uint32_t LoadBatch();

  // Load all sequences in a file. This function should only be used to load
  // reference. And once the reference is loaded, the batch should never be
  // updated. This func is slow when there are large number of sequences.
  void LoadAllSequences();


  inline void CorrectBaseAt(uint32_t sequence_index, uint32_t base_position,
  inline void CorrectBaseAt(uint32_t sequence_index, uint32_t base_position,
                            char correct_base) {
                            char correct_base) {
@@ -185,17 +195,34 @@ class SequenceBatch {
  }
  }


 protected:
 protected:
  // When 'is_seq' is set to true, this func will complement the base when
  // necessary. Otherwise, it will just reverse the sequence.
  void ReplaceByEffectiveRange(kstring_t &seq, bool is_seq);

  // This is the accumulated number of sequences that have ever been loaded into
  // the batch. It is useful for tracking read ids.
  uint32_t total_num_loaded_sequences_ = 0;

  // This is the number of sequences loaded into the current batch.
  uint32_t num_loaded_sequences_ = 0;
  uint32_t num_loaded_sequences_ = 0;
  uint32_t max_num_sequences_ = 0;

  // This is the number of bases loaded into the current batch. It is only
  // populated for the reference.
  uint64_t num_bases_ = 0;
  uint64_t num_bases_ = 0;

  // This is the max number of sequences that can be loaded into the batch. It
  // is set to 0 when there is no such restriction.
  uint32_t max_num_sequences_ = 0;

  gzFile sequence_file_;
  gzFile sequence_file_;
  kseq_t *sequence_kseq_ = nullptr;
  kseq_t *sequence_kseq_ = nullptr;
  std::vector<kseq_t *> sequence_batch_;
  std::vector<kseq_t *> sequence_batch_;

  // TODO: avoid constructing the negative sequence batch.
  std::vector<std::string> negative_sequence_batch_;
  std::vector<std::string> negative_sequence_batch_;


  // Actual range within each sequence.
  // Actual range within each sequence.
  const SequenceEffectiveRange effective_range_;
  const SequenceEffectiveRange effective_range_;
  void ReplaceByEffectiveRange(kstring_t &seq, bool is_seq);
};
};


}  // namespace chromap
}  // namespace chromap