Commit da19c8cb authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Require effective range as SequenceBatch parameter.

Make default range as full range.
Rename format parsing function name.
parent 97b0b0d2
Loading
Loading
Loading
Loading
+18 −14
Original line number Diff line number Diff line
@@ -299,9 +299,7 @@ uint32_t Chromap::SampleInputBarcodesAndExamineLength() {
  }

  uint32_t sample_batch_size = 1000;
  SequenceBatch barcode_batch(sample_batch_size);

  barcode_batch.SetSeqEffectiveRange(barcode_effective_range_);
  SequenceBatch barcode_batch(sample_batch_size, barcode_effective_range_);

  barcode_batch.InitializeLoading(mapping_parameters_.barcode_file_paths[0]);

@@ -381,8 +379,7 @@ void Chromap::LoadBarcodeWhitelist() {

void Chromap::ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes) {
  double real_start_time = GetRealTime();
  SequenceBatch barcode_batch(read_batch_size_);
  barcode_batch.SetSeqEffectiveRange(barcode_effective_range_);
  SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_);
  for (size_t read_file_index = 0;
       read_file_index < mapping_parameters_.read_file1_paths.size();
       ++read_file_index) {
@@ -714,9 +711,9 @@ void Chromap::OutputMappingStatistics() {
}

void Chromap::ParseReadFormat(const std::string &read_format) {
  read1_effective_range_.Init();
  read2_effective_range_.Init();
  barcode_effective_range_.Init();
  read1_effective_range_.InitializeParsing();
  read2_effective_range_.InitializeParsing();
  barcode_effective_range_.InitializeParsing();

  uint32_t i, j;
  for (i = 0; i < read_format.size();) {
@@ -724,13 +721,16 @@ void Chromap::ParseReadFormat(const std::string &read_format) {
      ;
    bool parse_success = true;
    if (read_format[i] == 'r' && read_format[i + 1] == '1') {
      parse_success = read1_effective_range_.ParseEffectiveRange(
      parse_success =
          read1_effective_range_.ParseFormatStringAndAppendEffectiveRange(
              read_format.c_str() + i, j - i);
    } else if (read_format[i] == 'r' && read_format[i + 1] == '2') {
      parse_success = read2_effective_range_.ParseEffectiveRange(
      parse_success =
          read2_effective_range_.ParseFormatStringAndAppendEffectiveRange(
              read_format.c_str() + i, j - i);
    } else if (read_format[i] == 'b' && read_format[i + 1] == 'c') {
      parse_success = barcode_effective_range_.ParseEffectiveRange(
      parse_success =
          barcode_effective_range_.ParseFormatStringAndAppendEffectiveRange(
              read_format.c_str() + i, j - i);
    } else {
      parse_success = false;
@@ -742,6 +742,10 @@ void Chromap::ParseReadFormat(const std::string &read_format) {

    i = j + 1;
  }

  read1_effective_range_.FinalizeParsing();
  read2_effective_range_.FinalizeParsing();
  barcode_effective_range_.FinalizeParsing();
}

void Chromap::GenerateCustomRidRanks(
+16 −21
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@
#include "mmcache.hpp"
#include "paired_end_mapping_metadata.h"
#include "sequence_batch.h"
#include "sequence_effective_range.h"
#include "temp_mapping.h"
#include "utils.h"

@@ -184,14 +185,12 @@ void Chromap::MapSingleEndReads() {
  int kmer_size = index.GetKmerSize();
  // index.Statistics(num_sequences, reference);

  SequenceBatch read_batch(read_batch_size_);
  SequenceBatch read_batch_for_loading(read_batch_size_);
  SequenceBatch barcode_batch(read_batch_size_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_);
  read_batch.SetSeqEffectiveRange(read1_effective_range_);
  read_batch_for_loading.SetSeqEffectiveRange(read1_effective_range_);
  barcode_batch.SetSeqEffectiveRange(barcode_effective_range_);
  barcode_batch_for_loading.SetSeqEffectiveRange(barcode_effective_range_);
  SequenceBatch read_batch(read_batch_size_, read1_effective_range_);
  SequenceBatch read_batch_for_loading(read_batch_size_,
                                       read1_effective_range_);
  SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_,
                                          barcode_effective_range_);

  std::vector<std::vector<MappingRecord>> mappings_on_diff_ref_seqs;
  mappings_on_diff_ref_seqs.reserve(num_reference_sequences);
@@ -552,19 +551,15 @@ void Chromap::MapPairedEndReads() {
  // index.Statistics(num_sequences, reference);

  // Initialize read batches
  SequenceBatch read_batch1(read_batch_size_);
  SequenceBatch read_batch2(read_batch_size_);
  SequenceBatch barcode_batch(read_batch_size_);
  SequenceBatch read_batch1_for_loading(read_batch_size_);
  SequenceBatch read_batch2_for_loading(read_batch_size_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_);

  read_batch1.SetSeqEffectiveRange(read1_effective_range_);
  read_batch2.SetSeqEffectiveRange(read2_effective_range_);
  barcode_batch.SetSeqEffectiveRange(barcode_effective_range_);
  read_batch1_for_loading.SetSeqEffectiveRange(read1_effective_range_);
  read_batch2_for_loading.SetSeqEffectiveRange(read2_effective_range_);
  barcode_batch_for_loading.SetSeqEffectiveRange(barcode_effective_range_);
  SequenceBatch read_batch1(read_batch_size_, read1_effective_range_);
  SequenceBatch read_batch2(read_batch_size_, read2_effective_range_);
  SequenceBatch barcode_batch(read_batch_size_, barcode_effective_range_);
  SequenceBatch read_batch1_for_loading(read_batch_size_,
                                        read1_effective_range_);
  SequenceBatch read_batch2_for_loading(read_batch_size_,
                                        read2_effective_range_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_,
                                          barcode_effective_range_);

  // Initialize cache
  mm_cache mm_to_candidates_cache(2000003);
+6 −9
Original line number Diff line number Diff line
@@ -17,18 +17,19 @@ namespace chromap {
class SequenceBatch {
 public:
  KSEQ_INIT(gzFile, gzread);
  SequenceBatch() { effective_range_.Init(); }
  SequenceBatch() = default;

  // Construct once and use update sequences when loading each batch.
  SequenceBatch(uint32_t max_num_sequences)
      : max_num_sequences_(max_num_sequences) {
  SequenceBatch(uint32_t max_num_sequences,
                const SequenceEffectiveRange &effective_range)
      : max_num_sequences_(max_num_sequences),
        effective_range_(effective_range) {
    sequence_batch_.reserve(max_num_sequences_);
    for (uint32_t i = 0; i < max_num_sequences_; ++i) {
      sequence_batch_.emplace_back((kseq_t *)calloc(1, sizeof(kseq_t)));
      sequence_batch_.back()->f = NULL;
    }
    negative_sequence_batch_.assign(max_num_sequences_, "");
    effective_range_.Init();
  }

  ~SequenceBatch() {
@@ -98,10 +99,6 @@ class SequenceBatch {
    }
  }

  inline void SetSeqEffectiveRange(const SequenceEffectiveRange &range) {
    effective_range_ = range;
  }

  //  inline char GetReverseComplementBaseOfSequenceAt(uint32_t sequence_index,
  //  uint32_t position) {
  //    kseq_t *sequence = sequence_batch_[sequence_index];
@@ -231,7 +228,7 @@ class SequenceBatch {
  std::vector<std::string> negative_sequence_batch_;

  // Actual range within each sequence.
  SequenceEffectiveRange effective_range_;
  const SequenceEffectiveRange effective_range_;
  void ReplaceByEffectiveRange(kstring_t &seq, bool is_seq);
};

+25 −34
Original line number Diff line number Diff line
@@ -11,33 +11,35 @@
namespace chromap {

// The class handles the custom read format indicating the effective range on a
// sequence.
// sequence. Default is the full range.
class SequenceEffectiveRange {
 public:
  SequenceEffectiveRange() {}
  ~SequenceEffectiveRange() {}
  SequenceEffectiveRange() = default;
  ~SequenceEffectiveRange() = default;

  void Init() {
    starts.push_back(0);
    ends.push_back(-1);
  void InitializeParsing() {
    starts.clear();
    ends.clear();
    strand = '+';
    default_range = true;
  }

  void FinalizeParsing() {
    std::sort(starts.begin(), starts.end());
    std::sort(ends.begin(), ends.end());

    if (ends[0] == -1) {
      ends.erase(ends.begin());
      ends.push_back(-1);
    }
  }

  // Return false if it fails to parse the format string.
  bool ParseEffectiveRange(const char *s, int len) {
  bool ParseFormatStringAndAppendEffectiveRange(const char *s, int len) {
    int i;
    int j = 0;  // start, end, strand section
    char buffer[20];
    int blen = 0;

    if (default_range) {
      starts.clear();
      ends.clear();
      strand = '+';
      default_range = false;
    }

    for (i = 3; i <= len; ++i) {
      if (i == len || s[i] == ':') {
        buffer[blen] = '\0';
@@ -50,7 +52,9 @@ class SequenceEffectiveRange {
        }

        blen = 0;
        if (i < len && s[i] == ':') ++j;
        if (i < len && s[i] == ':') {
          ++j;
        }
      } else {
        buffer[blen] = s[i];
        ++blen;
@@ -61,23 +65,12 @@ class SequenceEffectiveRange {
      return false;
    }

    std::sort(starts.begin(), starts.end());
    std::sort(ends.begin(), ends.end());

    const int num_ranges = starts.size();
    if (ends[0] == -1) {
      for (i = 0; i < num_ranges - 1; ++i) {
        ends[i] = ends[i + 1];
      }
      ends[i] = -1;
    }

    return true;
  }

  // Replace by the range specified in the starts, ends section, but does not
  // apply the strand operation. Return new length.
  int Replace(char *s, int len, bool need_complement) {
  int Replace(char *s, int len, bool need_complement) const {
    if (IsFullRangeAndPositiveStrand()) {
      return len;
    }
@@ -118,7 +111,7 @@ class SequenceEffectiveRange {
  }

 private:
  bool IsFullRangeAndPositiveStrand() {
  bool IsFullRangeAndPositiveStrand() const {
    if (strand == '+' && starts[0] == 0 && ends[0] == -1) {
      return true;
    }
@@ -126,13 +119,11 @@ class SequenceEffectiveRange {
    return false;
  }

  std::vector<int> starts;
  std::vector<int> ends;
  std::vector<int> starts = {0};
  std::vector<int> ends = {-1};
  // Strand is either '+' or '-'. The barcode will be reverse-complemented after
  // extraction if strand is '-'.
  char strand;
  // Whether the range has been modified by new input.
  bool default_range;
  char strand = '+';
};

}  // namespace chromap