Commit 7556c47b authored by Li's avatar Li Committed by Haowen Zhang
Browse files

Allow the specification of non-consecutive sequence range usage through --read-format

parent 148082f7
Loading
Loading
Loading
Loading
+23 −79
Original line number Diff line number Diff line
@@ -301,8 +301,7 @@ uint32_t Chromap::SampleInputBarcodesAndExamineLength() {
  uint32_t sample_batch_size = 1000;
  SequenceBatch barcode_batch(sample_batch_size);

  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1],
                                     barcode_format_[2]);
  barcode_batch.SetSeqEffectiveRange(barcode_format_);

  barcode_batch.InitializeLoading(mapping_parameters_.barcode_file_paths[0]);

@@ -383,8 +382,7 @@ void Chromap::LoadBarcodeWhitelist() {
void Chromap::ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes) {
  double real_start_time = GetRealTime();
  SequenceBatch barcode_batch(read_batch_size_);
  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1],
                                     barcode_format_[2]);
  barcode_batch.SetSeqEffectiveRange(barcode_format_);
  for (size_t read_file_index = 0;
       read_file_index < mapping_parameters_.read_file1_paths.size();
       ++read_file_index) {
@@ -717,84 +715,30 @@ void Chromap::OutputMappingStatistics() {
}

void Chromap::ParseReadFormat(const std::string &read_format) {
  uint32_t i;
  int j = 0;
  int k = 0;  // for read1, read2, or barcode
  read1_format_[0] = 0;
  read1_format_[1] = -1;
  read1_format_[2] = 1;
  read2_format_[0] = 0;
  read2_format_[1] = -1;
  read2_format_[2] = 1;
  barcode_format_[0] = 0;
  barcode_format_[1] = -1;
  barcode_format_[2] = 1;
  int fields[3] = {0, -1, 1};
  char buffer[20];
  int blen = 0;
  for (i = 0; i < read_format.size(); ++i) {
    if (read_format[i] == ',' || i == 0) {
      if (i > 0) {
        buffer[blen] = '\0';
        if (j <= 1) {
          fields[j] = atoi(buffer);
  uint32_t i, j;
  read1_format_.Init();
  read2_format_.Init();
  barcode_format_.Init();
  for (i = 0; i < read_format.size(); ) {
    for (j = i + 1; j < read_format.size() && j != ','; ++j) 
      ;
    bool parse_success = true;
    if (read_format[i] == 'r' && read_format[i + 1] == '1') {
      parse_success = read1_format_.ParseEffectiveRange(
                read_format.c_str() + i, j - i);
    } else if (read_format[i] == 'r' && read_format[i + 1] == '2') {
      parse_success = read2_format_.ParseEffectiveRange(
                read_format.c_str() + i, j - i);
    } else if (read_format[i] == 'b' && read_format[i + 1] == 'c') {
      parse_success = barcode_format_.ParseEffectiveRange(
                read_format.c_str() + i, j - i);
    } else {
          fields[j] = buffer[0] == '+' ? 1 : -1;
        }
        if (k == 0)
          memcpy(read1_format_, fields, sizeof(fields));
        else if (k == 1)
          memcpy(read2_format_, fields, sizeof(fields));
        else
          memcpy(barcode_format_, fields, sizeof(fields));

        ++i;
      }
      if (read_format[i] == 'r' && read_format[i + 1] == '1')
        k = 0;
      else if (read_format[i] == 'r' && read_format[i + 1] == '2')
        k = 1;
      else if (read_format[i] == 'b' && read_format[i + 1] == 'c')
        k = 2;
      else
        ExitWithMessage("Unknown read format: " + read_format + "\n");
      j = 0;
      fields[0] = fields[1] = 0;
      i += 2;
      blen = 0;
    } else {
      if (read_format[i] != ':') {
        if (j < 3) {
          buffer[blen] = read_format[i];
          ++blen;
        } else {
          ExitWithMessage("Unknown read format: " + read_format + "\n");
        }
      } else {
        buffer[blen] = '\0';
        if (j <= 1) {
          fields[j] = atoi(buffer);
        } else {
          fields[j] = buffer[0] == '+' ? 1 : -1;
        }
        ++j;
        blen = 0;
      }
      parse_success = false;
    }
    if (!parse_success) 
      ExitWithMessage("Unknown read format: " + read_format + "\n");
    i = j;
  }
  buffer[blen] = '\0';
  if (j <= 1) {
    fields[j] = atoi(buffer);
  } else {
    fields[j] = buffer[0] == '+' ? 1 : -1;
  }
  // By initialization, it is fine even if there is no read_format specified
  if (k == 0)
    memcpy(read1_format_, fields, sizeof(fields));
  else if (k == 1)
    memcpy(read2_format_, fields, sizeof(fields));
  else
    memcpy(barcode_format_, fields, sizeof(fields));
}

void Chromap::GenerateCustomRidRanks(
+13 −20
Original line number Diff line number Diff line
@@ -132,9 +132,9 @@ class Chromap {
  const uint32_t read_batch_size_ = 500000;

  // 0-start, 1-end (includsive), 2-strand(-1:minus, 1:plus)
  int barcode_format_[3];
  int read1_format_[3];
  int read2_format_[3];
  SequenceEffectiveRange barcode_format_;
  SequenceEffectiveRange read1_format_;
  SequenceEffectiveRange read2_format_;

  std::vector<int> custom_rid_rank_;
  std::vector<int> pairs_custom_rid_rank_;
@@ -188,13 +188,10 @@ void Chromap::MapSingleEndReads() {
  SequenceBatch read_batch_for_loading(read_batch_size_);
  SequenceBatch barcode_batch(read_batch_size_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_);
  read_batch.SetSeqEffectiveRange(read1_format_[0], read1_format_[1], 1);
  read_batch_for_loading.SetSeqEffectiveRange(read1_format_[0],
                                              read1_format_[1], 1);
  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1],
                                     barcode_format_[2]);
  barcode_batch_for_loading.SetSeqEffectiveRange(
      barcode_format_[0], barcode_format_[1], barcode_format_[2]);
  read_batch.SetSeqEffectiveRange(read1_format_);
  read_batch_for_loading.SetSeqEffectiveRange(read1_format_);
  barcode_batch.SetSeqEffectiveRange(barcode_format_);
  barcode_batch_for_loading.SetSeqEffectiveRange(barcode_format_);

  std::vector<std::vector<MappingRecord>> mappings_on_diff_ref_seqs;
  mappings_on_diff_ref_seqs.reserve(num_reference_sequences);
@@ -561,16 +558,12 @@ void Chromap::MapPairedEndReads() {
  SequenceBatch read_batch2_for_loading(read_batch_size_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_);

  read_batch1.SetSeqEffectiveRange(read1_format_[0], read1_format_[1], 1);
  read_batch2.SetSeqEffectiveRange(read2_format_[0], read2_format_[1], 1);
  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1],
                                     barcode_format_[2]);
  read_batch1_for_loading.SetSeqEffectiveRange(read1_format_[0],
                                               read1_format_[1], 1);
  read_batch2_for_loading.SetSeqEffectiveRange(read2_format_[0],
                                               read2_format_[1], 1);
  barcode_batch_for_loading.SetSeqEffectiveRange(
      barcode_format_[0], barcode_format_[1], barcode_format_[2]);
  read_batch1.SetSeqEffectiveRange(read1_format_);
  read_batch2.SetSeqEffectiveRange(read2_format_);
  barcode_batch.SetSeqEffectiveRange(barcode_format_);
  read_batch1_for_loading.SetSeqEffectiveRange(read1_format_);
  read_batch2_for_loading.SetSeqEffectiveRange(read2_format_);
  barcode_batch_for_loading.SetSeqEffectiveRange(barcode_format_);

  // Initialize cache
  mm_cache mm_to_candidates_cache(2000003);
+1 −25
Original line number Diff line number Diff line
@@ -138,31 +138,7 @@ void SequenceBatch::FinalizeLoading() {
}

void SequenceBatch::ReplaceByEffectiveRange(kstring_t &seq, bool is_seq) {
  if (effective_range_[0] == 0 && effective_range_[1] == -1 &&
      effective_range_[2] == 1) {
    return;
  }
  int i, j;
  int start = effective_range_[0];
  int end = effective_range_[1];
  if (effective_range_[1] == -1) end = seq.l - 1;
  for (i = 0; i < end - start + 1; ++i) {
    seq.s[i] = seq.s[start + i];
  }
  seq.s[i] = '\0';
  seq.l = end - start + 1;
  if (effective_range_[2] == -1) {
    if (is_seq) {
      for (i = 0; i < (int)seq.l; ++i) {
        seq.s[i] = Uint8ToChar(((uint8_t)3) ^ (CharToUint8(seq.s[i])));
      }
    }
    for (i = 0, j = seq.l - 1; i < j; ++i, --j) {
      char tmp = seq.s[i];
      seq.s[i] = seq.s[j];
      seq.s[j] = tmp;
    }
  }
  seq.l = effective_range_.Replace(seq.s, seq.l, is_seq);
}

}  // namespace chromap
+6 −11
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
#include <vector>

#include "kseq.h"
#include "sequence_effective_range.h"

namespace chromap {

@@ -16,9 +17,7 @@ class SequenceBatch {
 public:
  KSEQ_INIT(gzFile, gzread);
  SequenceBatch() {
    effective_range_[0] = 0;
    effective_range_[1] = -1;
    effective_range_[2] = 1;
    effective_range_.Init();
  }

  // Construct once and use update sequences when loading each batch.
@@ -30,9 +29,7 @@ class SequenceBatch {
      sequence_batch_.back()->f = NULL;
    }
    negative_sequence_batch_.assign(max_num_sequences_, "");
    effective_range_[0] = 0;
    effective_range_[1] = -1;
    effective_range_[2] = 1;
    effective_range_.Init();
  }

  ~SequenceBatch() {
@@ -102,10 +99,8 @@ class SequenceBatch {
    }
  }

  inline void SetSeqEffectiveRange(int start, int end, int strand) {
    effective_range_[0] = start;
    effective_range_[1] = end;
    effective_range_[2] = strand;
  inline void SetSeqEffectiveRange(const SequenceEffectiveRange &range) {
    effective_range_ = range;
  }

  //  inline char GetReverseComplementBaseOfSequenceAt(uint32_t sequence_index,
@@ -247,7 +242,7 @@ class SequenceBatch {
  kseq_t *sequence_kseq_ = nullptr;
  std::vector<kseq_t *> sequence_batch_;
  std::vector<std::string> negative_sequence_batch_;
  int effective_range_[3] = {0, -1, 1};  // actual range within each sequence.
  SequenceEffectiveRange effective_range_;  // actual range within each sequence.

  static constexpr uint8_t char_to_uint8_table_[256] = {
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+134 −0
Original line number Diff line number Diff line
#ifndef SEQUENCEEFFECTIVERANGE_H_
#define SEQUENCEEFFECTIVERANGE_H_

#include <algorithm>
#include <vector>

#include <stdlib.h>

namespace chromap {

// The class handles the custom read format indicating 
// the effective range on a sequence.
class SequenceEffectiveRange {
 public:
  SequenceEffectiveRange() {}
  ~SequenceEffectiveRange() {}

  void Init() {
    starts.push_back(0);
    ends.push_back(-1);
    strand = 1;
    range_num = 1;
  }
  
  // Return: false - fail to parse
  bool ParseEffectiveRange(const char *s, int len) {
    int i;
    int j = 0; // start, end, strand section
    char buffer[20];
    int blen = 0;
    starts.clear();
    ends.clear();
    strand = 1;
    for (i = 3; i <= len; ++i) {
      if (i == len || s[i] == '/' || s[i] == ':') {
        buffer[blen] = '\0';
        if (j == 0) 
          starts.push_back(atoi(buffer));
        else if (j == 1)
          ends.push_back(atoi(buffer));
        else 
          strand = buffer[0] == '+' ? 1 : -1;
        blen = 0;
        if (i < len && s[i] == ':')
          ++j;
      } else {
        buffer[blen] = s[i];
        ++blen;
      }
    }
    if (j >= 3 || starts.size() != ends.size())
      return false;
    std::sort(starts.begin(), starts.end());
    std::sort(ends.begin(), ends.end());
    range_num = starts.size();
    if (ends[0] == -1) {
      for (i = 0; i < range_num - 1; ++i)  
        ends[i] = ends[i + 1];
      ends[i] = -1;
    }
    return true;
  }

  // Check whether the effective range is the full range
  bool IsFullRange() {
    if (strand == 1 && starts[0] == 0 && ends[0] == -1)
      return true;
    return false;
  }

  // Replace by the range specified in the starts,ends section, but does not apply the strand operation.
  // return new length
  int Replace(char *s, int len, bool need_complement) {
    if (IsFullRange())
      return len;
    int i, j, k;
    i = 0;
    for (k = 0; k < range_num; ++k) {
      int start = starts[k];
      int end = ends[k];
      if (end == -1) end = len - 1;
      for (j = start; j <= end; ++i, ++j)
        s[i] = s[j];
    }
    s[i] = '\0';
    len = i;
    if (strand == -1) {
      if (need_complement) {
        for (i = 0; i < len; ++i) 
          s[i] = Uint8ToChar(((uint8_t)3) ^ (CharToUint8(s[i])));
      }

      for (i = 0, j = len - 1; i < j; ++i, --j) {
        char tmp = s[i];
        s[i] = s[j];
        s[j] = tmp;
      }
    }
    return len;
  }

 private:
  std::vector<int> starts;
  std::vector<int> ends;
  int range_num;
  int strand;

  uint8_t char_to_uint8_table_[256] = {
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
  char uint8_to_char_table_[8] = {'A', 'C', 'G', 'T',
                                                   'N', 'N', 'N', 'N'};
  uint8_t CharToUint8(const char c) {
    return char_to_uint8_table_[(uint8_t)c];
  }

  char Uint8ToChar(const uint8_t i) {
    return uint8_to_char_table_[i];
  }

};

}

#endif