Commit b9de5fcd authored by Li's avatar Li Committed by Haowen Zhang
Browse files

Add the strand field in the --read-format option.

parent d0c57d40
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -151,8 +151,8 @@ Chromap also supports user-defined barcode format, including mixed barcode and g
data case. User can specify the sequence structure through option **--read-format**. The value
is a comma-separated string, each field in the string is also a semi-comma-splitted string

    [r1|r2|bc]:start:end
The start and end are inclusive and -1 means the end of the read. For example,
    [r1|r2|bc]:start:end:strand
The start and end are inclusive and -1 means the end of the read. The strand is presented by '+' and '-' symbol, if '-' the barcode will be reverse-complemented after extraction. The strand symbol can be omitted if it is '+' and is ignored on r1 and r2. For example,
when the barcode is in the first 16bp of read1, one can use the option 
`-1 read1.fq.gz -2 read2.fq.gz --barcode read1.fq.gz --read-format bc:0:15,r1:16:-1`

+33 −17
Original line number Diff line number Diff line
@@ -1312,15 +1312,16 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
  SequenceBatch read_batch2_for_loading(read_batch_size_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_);

  read_batch1.SetSeqEffectiveRange(read1_format_[0], read1_format_[1]);
  read_batch2.SetSeqEffectiveRange(read2_format_[0], read2_format_[1]);
  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1]);
  read_batch1.SetSeqEffectiveRange(read1_format_[0], read1_format_[1], 1);
  read_batch2.SetSeqEffectiveRange(read2_format_[0], read2_format_[1], 1);
  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1], barcode_format_[2]);
  read_batch1_for_loading.SetSeqEffectiveRange(read1_format_[0],
                                               read1_format_[1]);
                                               read1_format_[1], 1);
  read_batch2_for_loading.SetSeqEffectiveRange(read2_format_[0],
                                               read2_format_[1]);
                                               read2_format_[1], 1);
  barcode_batch_for_loading.SetSeqEffectiveRange(barcode_format_[0],
                                                 barcode_format_[1]);
                                                 barcode_format_[1],
                                                 barcode_format_[2]);

  // Initialize cache
  mm_cache mm_to_candidates_cache(2000003);
@@ -2744,12 +2745,13 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
  SequenceBatch read_batch_for_loading(read_batch_size_);
  SequenceBatch barcode_batch(read_batch_size_);
  SequenceBatch barcode_batch_for_loading(read_batch_size_);
  read_batch.SetSeqEffectiveRange(read1_format_[0], read1_format_[1]);
  read_batch.SetSeqEffectiveRange(read1_format_[0], read1_format_[1], 1);
  read_batch_for_loading.SetSeqEffectiveRange(read1_format_[0],
                                              read1_format_[1]);
  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1]);
                                              read1_format_[1], 1);
  barcode_batch.SetSeqEffectiveRange(barcode_format_[0], barcode_format_[1], barcode_format_[2]);
  barcode_batch_for_loading.SetSeqEffectiveRange(barcode_format_[0],
                                                 barcode_format_[1]);
                                                 barcode_format_[1],
                                                 barcode_format_[2]);

  mappings_on_diff_ref_seqs_.reserve(num_reference_sequences);
  deduped_mappings_on_diff_ref_seqs_.reserve(num_reference_sequences);
@@ -5771,19 +5773,25 @@ void Chromap<MappingRecord>::ParseReadFormat(const std::string &read_format) {
  int k = 0;  // for read1, read2, or barcode
  read1_format_[0] = 0;
  read1_format_[1] = -1;
  read1_format_[2] = 1;
  read2_format_[0] = 0;
  read2_format_[1] = -1; 
  read2_format_[2] = 1;
  barcode_format_[0] = 0;
  barcode_format_[1] = -1;
  int fields[2] = {0, -1};
  barcode_format_[2] = 1;
  int fields[3] = {0, -1, 1};
  char buffer[20];
  int blen = 0;
  for (i = 0; i < read_format.size(); ++i) {
    if (read_format[i] == ',' || i == 0) {
      if (i > 0) {
        buffer[blen] = '\0';
        if (j <= 1) {
          fields[j] = atoi(buffer);

        } else {
          fields[j] = buffer[0]=='+'?1:-1;
        }
        if (k == 0)
          memcpy(read1_format_, fields, sizeof(fields));
        else if (k == 1)
@@ -5807,7 +5815,7 @@ void Chromap<MappingRecord>::ParseReadFormat(const std::string &read_format) {
      blen = 0;
    } else {
      if (read_format[i] != ':') {
        if (j < 2) {
        if (j < 3) {
          buffer[blen] = read_format[i];
          ++blen;
        } else {
@@ -5815,14 +5823,22 @@ void Chromap<MappingRecord>::ParseReadFormat(const std::string &read_format) {
        }
      } else {
        buffer[blen] = '\0';
        if (j <= 1) {
          fields[j] = atoi(buffer);
        } else {
          fields[j] = buffer[0]=='+'?1:-1;
        }
        ++j;
        blen = 0;
      }
    }
  }
  buffer[blen] = '\0';
  if (j <= 1) {
    fields[j] = atoi(buffer);
  } else {
    fields[j] = buffer[0]=='+'?1:-1;
  }
  // By initialization, it is fine even if there is no read_format specified
  if (k == 0)
    memcpy(read1_format_, fields, sizeof(fields));
+4 −4
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@
#include "sequence_batch.h"
#include "temp_mapping.h"

#define CHROMAP_VERSION "0.1.4-r284"
#define CHROMAP_VERSION "0.1.3-r257"

namespace chromap {
struct uint128_t {
@@ -591,9 +591,9 @@ class Chromap {
  std::vector<std::string> read_file2_paths_;
  std::vector<std::string> barcode_file_paths_;
  std::string barcode_whitelist_file_path_;
  int barcode_format_[2];  // 0-start, 1-end (includsive)
  int read1_format_[2];
  int read2_format_[2];
  int barcode_format_[3];  // 0-start, 1-end (includsive), 2-strand(-1:minus, 1:plus)
  int read1_format_[3];
  int read2_format_[3];
  std::string mapping_output_file_path_;
  FILE *mapping_output_file_;
  std::string matrix_output_prefix_;
+19 −4
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ class SequenceBatch {
  SequenceBatch() {
    effective_range_[0] = 0;
    effective_range_[1] = -1;
    effective_range_[2] = 1;
  }
  SequenceBatch(uint32_t max_num_sequences)
      : max_num_sequences_(max_num_sequences) {
@@ -29,6 +30,7 @@ class SequenceBatch {
    negative_sequence_batch_.assign(max_num_sequences_, "");
    effective_range_[0] = 0;
    effective_range_[1] = -1;
    effective_range_[2] = 1;
  }
  ~SequenceBatch() {
    if (sequence_batch_.size() > 0) {
@@ -66,9 +68,10 @@ class SequenceBatch {
    return negative_sequence_batch_[sequence_index];
  }
  inline int GetSequenceBatchSize() const { return sequence_batch_.size(); }
  inline void SetSeqEffectiveRange(int start, int end) {
  inline void SetSeqEffectiveRange(int start, int end, int strand) {
    effective_range_[0] = start;
    effective_range_[1] = end;
    effective_range_[2] = strand;
  }
  //  inline char GetReverseComplementBaseOfSequenceAt(uint32_t sequence_index,
  //  uint32_t position) {
@@ -193,7 +196,7 @@ class SequenceBatch {
  kseq_t *sequence_kseq_;
  std::vector<kseq_t *> sequence_batch_;
  std::vector<std::string> negative_sequence_batch_;
  int effective_range_[2];  // actual range within each sequence.
  int effective_range_[3];  // actual range within each sequence.
  static constexpr uint8_t char_to_uint8_table_[256] = {
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -209,8 +212,10 @@ class SequenceBatch {
  static constexpr char uint8_to_char_table_[8] = {'A', 'C', 'G', 'T',
                                                   'N', 'N', 'N', 'N'};
  void ReplaceByEffectiveRange(kstring_t &seq) {
    if (effective_range_[0] == 0 && effective_range_[1] == -1) return;
    int i;
    if (effective_range_[0] == 0 && effective_range_[1] == -1 && effective_range_[2] == 1) {
      return;
    }
    int i, j;
    int start = effective_range_[0];
    int end = effective_range_[1];
    if (effective_range_[1] == -1) end = seq.l - 1;
@@ -219,6 +224,16 @@ class SequenceBatch {
    }
    seq.s[i] = '\0';
    seq.l = end - start + 1;
    if (effective_range_[2] == -1) {
      for (i = 0 ; i < (int)seq.l ; ++i) {
        seq.s[i] = Uint8ToChar(((uint8_t)3)^(CharToUint8(seq.s[i])));
      }
      for (i = 0, j = seq.l - 1; i < j ; ++i, --j) {
        char tmp = seq.s[i];
        seq.s[i] = seq.s[j];
        seq.s[j] = tmp;
      }
    }
  }
};
}  // namespace chromap