Commit 34905b18 authored by Li's avatar Li Committed by Haowen Zhang
Browse files

Use multiple read format fields to specify non-consecutive sequence. Fix several bugs in parsing.

parent de5470ae
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -152,8 +152,8 @@ Chromap also supports user-defined barcode format, including mixed barcode and g
data case. Users can specify the sequence structure through the option **--read-format**. The value
is a comma-separated string; each field in the string is itself a colon-separated string:

    [r1|r2|bc]:start0/start1/...:end0/end1/...:strand
The start and end are inclusive and -1 means the end of the read. User may use '/' symbol to specify multiple segments. The strand is presented by '+' and '-' symbol, if '-' the barcode will be reverse-complemented after extraction. The strand symbol can be omitted if it is '+' and is ignored on r1 and r2. For example,
    [r1|r2|bc]:start:end:strand
The start and end are inclusive, and -1 means the end of the read. Users may use multiple fields to specify non-consecutive segments, e.g. bc:0:15,bc:32:-1. The strand is represented by the '+' or '-' symbol; if it is '-', the barcode will be reverse-complemented after extraction. The strand symbol can be omitted if it is '+', and it is ignored for r1 and r2. For example,
when the barcode is in the first 16bp of read1, one can use the option 
`-1 read1.fq.gz -2 read2.fq.gz --barcode read1.fq.gz --read-format bc:0:15,r1:16:-1`

+2 −3
Original line number Diff line number Diff line
@@ -720,9 +720,8 @@ void Chromap::ParseReadFormat(const std::string &read_format) {

  uint32_t i, j;
  for (i = 0; i < read_format.size();) {
    for (j = i + 1; j < read_format.size() && j != ','; ++j)
    for (j = i + 1; j < read_format.size() && read_format[j] != ','; ++j)
      ;

    bool parse_success = true;
    if (read_format[i] == 'r' && read_format[i + 1] == '1') {
      parse_success = read1_effective_range_.ParseEffectiveRange(
@@ -741,7 +740,7 @@ void Chromap::ParseReadFormat(const std::string &read_format) {
      ExitWithMessage("Unknown read format: " + read_format + "\n");
    }

    i = j;
    i = j + 1;
  }
}

+11 −6
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@ class SequenceEffectiveRange {
    ends.push_back(-1);
    strand = 1;
    range_num = 1;
    default_range = true;
  }

  // Return false if it fails to parse the format string.
@@ -30,14 +31,17 @@ class SequenceEffectiveRange {
    int j = 0;  // start, end, strand section
    char buffer[20];
    int blen = 0;
   
    if (default_range) {
      starts.clear();
      ends.clear();
      strand = 1;
      default_range = false;
    }
  
    for (i = 3; i <= len; ++i) {
      if (i == len || s[i] == '/' || s[i] == ':') {
      if (i == len || s[i] == ':') {
        buffer[blen] = '\0';

        if (j == 0) {
          starts.push_back(atoi(buffer));
        } else if (j == 1) {
@@ -121,6 +125,7 @@ class SequenceEffectiveRange {
  std::vector<int> ends;
  int range_num;
  int strand;
  bool default_range; // whether the range has been modified by new input
};

}  // namespace chromap