Commit 50e87713 authored by Li's avatar Li
Browse files

Support the split alignment starting with gap

parent 93366593
Loading
Loading
Loading
Loading
+143 −21
Original line number Original line Diff line number Diff line
@@ -1629,7 +1629,7 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
                uint32_t current_num_mappings = positive_mappings.size() + negative_mappings.size();
                uint32_t current_num_mappings = positive_mappings.size() + negative_mappings.size();
                if (current_num_mappings > 0) {
                if (current_num_mappings > 0) {
                  std::vector<std::vector<MappingRecord> > &mappings_on_diff_ref_seqs = mappings_on_diff_ref_seqs_for_diff_threads[omp_get_thread_num()];
                  std::vector<std::vector<MappingRecord> > &mappings_on_diff_ref_seqs = mappings_on_diff_ref_seqs_for_diff_threads[omp_get_thread_num()];
                  GenerateBestMappingsForSingleEndRead(min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings, read_batch, read_index, reference, barcode_batch, positive_mappings, positive_split_sites, negative_mappings, negative_split_sites, &mappings_on_diff_ref_seqs);
                  GenerateBestMappingsForSingleEndRead(positive_candidates.size(), negative_candidates.size(), repetitive_seed_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings, read_batch, read_index, reference, barcode_batch, positive_mappings, positive_split_sites, negative_mappings, negative_split_sites, &mappings_on_diff_ref_seqs);
                  thread_num_mappings += std::min(num_best_mappings, max_num_best_mappings_);
                  thread_num_mappings += std::min(num_best_mappings, max_num_best_mappings_);
                  ++thread_num_mapped_reads;
                  ++thread_num_mapped_reads;
                  if (num_best_mappings == 1) {
                  if (num_best_mappings == 1) {
@@ -1750,7 +1750,8 @@ void Chromap<MappingRecord>::GenerateMDTag(const char *pattern, const char *text
    int num_cigar_operations = bam_cigar_oplen(current_cigar_uint);
    int num_cigar_operations = bam_cigar_oplen(current_cigar_uint);
    if (cigar_operation == BAM_CMATCH) {
    if (cigar_operation == BAM_CMATCH) {
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
      for (int opi = 0; opi < num_cigar_operations; ++opi) {
        if (reference[reference_position] == read[read_position]) {
        if (reference[reference_position] == read[read_position] 
	  || reference[reference_position] - 'a' + 'A' == read[read_position]) {
          // a match
          // a match
          ++num_matches;
          ++num_matches;
        } else {
        } else {
@@ -1788,8 +1789,54 @@ void Chromap<MappingRecord>::GenerateMDTag(const char *pattern, const char *text
  }
  }
}
}



// return: newly adjust reference start/end position (kPositive for start, kNegative for end)
template <typename MappingRecord>
template <typename MappingRecord>
void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mapping_direction, uint8_t mapq, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings, const std::vector<int> &split_sites, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs) {
int Chromap<MappingRecord>::AdjustGapBeginning(Direction mapping_direction, const char *ref, const char *read, int *gap_beginning,
	int read_end, int ref_start_position, int ref_end_position, int *n_cigar, uint32_t **cigar) {

  int i, j;
  if (mapping_direction == kPositive) {
    if (*gap_beginning <= 0) {
      return ref_start_position;
    }
    //printf("%d\n", *gap_beginning);
    for (i = *gap_beginning - 1, j = ref_start_position - 1 ; i >= 0 && j >= 0 ; --i, --j) {
      //printf("%c %c\n", read[i], ref[j]);
      if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
        break;   
      }
    }
    *gap_beginning = i + 1;
    //TODO: add soft clip in cigar
    if (n_cigar && *n_cigar > 0) {
      if (((*cigar)[0] & 0xf) == BAM_CMATCH) {
        (*cigar)[0] += (ref_start_position - 1 - j) << 4;
      }
    }
    return j + 1;
  } else {
    if (*gap_beginning <= 0) {
      return ref_end_position;
    }
    for (i = read_end + 1, j = ref_end_position + 1; read[i] && ref[j]; ++i, ++j) {
      if (read[i] != ref[j] && read[i] != ref[j] - 'a' + 'A') {
        break; 
      }
    }
    *gap_beginning = *gap_beginning + i - (read_end + 1);
    if (n_cigar && *n_cigar > 0) {
      if (((*cigar)[*n_cigar-1] & 0xf) == BAM_CMATCH) {
        (*cigar)[*n_cigar-1] += (j - (ref_end_position + 1)) << 4;
      }
    }

    return j - 1;  
  }
}

template <typename MappingRecord>
void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mapping_direction, uint8_t mapq, int num_candidates, uint32_t repetitive_seed_length, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings, const std::vector<int> &split_sites, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs) {
  int8_t mat[25];
  int8_t mat[25];
  //if (output_mapping_in_SAM_) {
  //if (output_mapping_in_SAM_) {
    int i, j, k;
    int i, j, k;
@@ -1813,11 +1860,14 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
        uint32_t rid = mappings[mi].second >> 32;
        uint32_t rid = mappings[mi].second >> 32;
        uint32_t position = mappings[mi].second;
        uint32_t position = mappings[mi].second;
        int split_site = mapping_direction == kPositive ? 0 : read_length;
        int split_site = mapping_direction == kPositive ? 0 : read_length;
	int gap_beginning = 0;
        if (split_alignment_) {
        if (split_alignment_) {
          split_site = split_sites[mi];
          split_site = split_sites[mi] & 0xffff;
          read_length = split_site;
	  gap_beginning = split_sites[mi] >> 16 ; // beginning means the 5' end of the read
          read_length = split_site - gap_beginning;
        }
        }
        uint32_t verification_window_start_position = position + 1 > (read_length + error_threshold_) ? position + 1 - read_length - error_threshold_ : 0;
        uint32_t verification_window_start_position = position + 1 > (read_length + error_threshold_) ? position + 1 - read_length - error_threshold_ : 0;
	//printf("%d %d. %d\n", position, verification_window_start_position, read_length);
        if (position >= reference.GetSequenceLengthAt(rid)) {
        if (position >= reference.GetSequenceLengthAt(rid)) {
          verification_window_start_position = reference.GetSequenceLengthAt(rid) - error_threshold_ - read_length; 
          verification_window_start_position = reference.GetSequenceLengthAt(rid) - error_threshold_ - read_length; 
        }
        }
@@ -1825,7 +1875,7 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
          if (split_sites[mi] < (int)read_batch.GetSequenceLengthAt(read_index)) {
          if (split_sites[mi] < (int)read_batch.GetSequenceLengthAt(read_index)) {
            split_site -= 3 * error_threshold_;
            split_site -= 3 * error_threshold_;
          } 
          } 
          read_length = split_site;
          read_length = split_site - gap_beginning;
        }
        }
        uint32_t barcode_key = 0;
        uint32_t barcode_key = 0;
        if (!is_bulk_data_) {
        if (!is_bulk_data_) {
@@ -1842,12 +1892,19 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
            int n_cigar = 0;
            int n_cigar = 0;
            uint32_t *cigar;
            uint32_t *cigar;
            int mapping_end_position;
            int mapping_end_position;
            ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position, read_length, read, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
	    ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position, read_length, read + gap_beginning, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
            //std::cerr << verification_window_start_position << " " << read_length << " " << split_site << " " << mapping_start_position << " " << mapping_end_position << "\n";
            //std::cerr << verification_window_start_position << " " << read_length << " " << split_site << " " << mapping_start_position << " " << mapping_end_position << "\n";
	    if (gap_beginning > 0) {
	      int new_ref_start_position = AdjustGapBeginning(mapping_direction, reference.GetSequenceAt(rid), 
	        read, &gap_beginning, read_length - 1, 
	        verification_window_start_position + mapping_start_position, 
		verification_window_start_position + mapping_end_position, &n_cigar, &cigar);
	      mapping_start_position = new_ref_start_position - verification_window_start_position;
	    }
            int NM = 0;
            int NM = 0;
            std::string MD_tag = "";
            std::string MD_tag = "";
            GenerateMDTag(reference.GetSequenceAt(rid), read, verification_window_start_position + mapping_start_position, n_cigar, cigar, NM, MD_tag);
            GenerateMDTag(reference.GetSequenceAt(rid), read + gap_beginning, verification_window_start_position + mapping_start_position, n_cigar, cigar, NM, MD_tag);
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, mapping_end_position - mapping_start_position, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            mapq = GetMAPQForSingleEndRead(error_threshold_, num_candidates, repetitive_seed_length, mapping_end_position - mapping_start_position, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + mapping_start_position, rid, flag, 0, is_unique, mapq, NM, n_cigar, cigar, MD_tag, &((*mappings_on_diff_ref_seqs)[rid]));
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + mapping_start_position, rid, flag, 0, is_unique, mapq, NM, n_cigar, cigar, MD_tag, &((*mappings_on_diff_ref_seqs)[rid]));
          } else {
          } else {
            //int n_cigar = 0;
            //int n_cigar = 0;
@@ -1861,14 +1918,14 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
            BandedTraceback(min_num_errors, reference.GetSequenceAt(rid) + verification_window_start_position, read, read_length, &mapping_start_position);
            BandedTraceback(min_num_errors, reference.GetSequenceAt(rid) + verification_window_start_position, read, read_length, &mapping_start_position);
            uint32_t fragment_start_position = verification_window_start_position + mapping_start_position;
            uint32_t fragment_start_position = verification_window_start_position + mapping_start_position;
            uint16_t fragment_length = position - fragment_start_position + 1;
            uint16_t fragment_length = position - fragment_start_position + 1;
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            mapq = GetMAPQForSingleEndRead(error_threshold_, num_candidates, repetitive_seed_length, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            if (output_mapping_in_PAF_) {
            if (output_mapping_in_PAF_) {
              EmplaceBackMappingRecord(read_id, read_name, (uint16_t)read_length, barcode_key, fragment_start_position, fragment_length, mapq, direction, is_unique, 1, &((*mappings_on_diff_ref_seqs)[rid]));
              EmplaceBackMappingRecord(read_id, read_name, (uint16_t)read_length, barcode_key, fragment_start_position, fragment_length, mapq, direction, is_unique, 1, &((*mappings_on_diff_ref_seqs)[rid]));
            } else {
            } else {
              EmplaceBackMappingRecord(read_id, barcode_key, fragment_start_position, fragment_length, mapq, direction, is_unique, 1, &((*mappings_on_diff_ref_seqs)[rid]));
              EmplaceBackMappingRecord(read_id, barcode_key, fragment_start_position, fragment_length, mapq, direction, is_unique, 1, &((*mappings_on_diff_ref_seqs)[rid]));
            }
            }
          }
          }
        } else {
        } else { // reverse strand
          uint8_t direction = 0;
          uint8_t direction = 0;
          int read_start_site = read_batch.GetSequenceLengthAt(read_index) - split_site;
          int read_start_site = read_batch.GetSequenceLengthAt(read_index) - split_site;
          if (output_mapping_in_SAM_) {
          if (output_mapping_in_SAM_) {
@@ -1876,10 +1933,18 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
            uint32_t *cigar;
            uint32_t *cigar;
            int mapping_end_position;
            int mapping_end_position;
            ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position + read_start_site, read_length, negative_read.data() + read_start_site, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
            ksw_semi_global3(read_length + 2 * error_threshold_, reference.GetSequenceAt(rid) + verification_window_start_position + read_start_site, read_length, negative_read.data() + read_start_site, 5, mat, gap_open_penalties_[0], gap_extension_penalties_[0], gap_open_penalties_[1], gap_extension_penalties_[1], error_threshold_ * 2 + 1, &n_cigar, &cigar, &mapping_start_position, &mapping_end_position);
	    if (gap_beginning > 0) {
	      int new_ref_end_position = AdjustGapBeginning(mapping_direction, reference.GetSequenceAt(rid), 
	        negative_read.data() + read_start_site, &gap_beginning, read_length, 
	        verification_window_start_position + mapping_start_position, 
		verification_window_start_position + mapping_end_position, &n_cigar, &cigar);
	      mapping_end_position = new_ref_end_position - verification_window_start_position; 
	      read_length = split_site - gap_beginning;
	    }
            int NM = 0;
            int NM = 0;
            std::string MD_tag = "";
            std::string MD_tag = "";
            GenerateMDTag(reference.GetSequenceAt(rid), negative_read.data() + read_start_site, verification_window_start_position + read_start_site + mapping_start_position, n_cigar, cigar, NM, MD_tag);
            GenerateMDTag(reference.GetSequenceAt(rid), negative_read.data() + read_start_site, verification_window_start_position + read_start_site + mapping_start_position, n_cigar, cigar, NM, MD_tag);
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, mapping_end_position - mapping_start_position, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            mapq = GetMAPQForSingleEndRead(error_threshold_, num_candidates, repetitive_seed_length, mapping_end_position - mapping_start_position, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + read_start_site + mapping_end_position, rid, flag, 1, is_unique, mapq, NM, n_cigar, cigar, MD_tag, &((*mappings_on_diff_ref_seqs)[rid]));
            EmplaceBackMappingRecord(read_id, read_name, 1, verification_window_start_position + read_start_site + mapping_end_position, rid, flag, 1, is_unique, mapq, NM, n_cigar, cigar, MD_tag, &((*mappings_on_diff_ref_seqs)[rid]));
          } else {
          } else {
            //int n_cigar = 0;
            //int n_cigar = 0;
@@ -1893,7 +1958,7 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
            BandedTraceback(min_num_errors, reference.GetSequenceAt(rid) + verification_window_start_position, negative_read.data() + read_start_site, read_length, &mapping_start_position);
            BandedTraceback(min_num_errors, reference.GetSequenceAt(rid) + verification_window_start_position, negative_read.data() + read_start_site, read_length, &mapping_start_position);
            uint32_t fragment_start_position = verification_window_start_position + mapping_start_position;
            uint32_t fragment_start_position = verification_window_start_position + mapping_start_position;
            uint16_t fragment_length = position - fragment_start_position + 1;
            uint16_t fragment_length = position - fragment_start_position + 1;
            mapq = GetMAPQForSingleEndRead(error_threshold_, 0, 0, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            mapq = GetMAPQForSingleEndRead(error_threshold_, num_candidates, repetitive_seed_length, fragment_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings);
            if (output_mapping_in_PAF_) {
            if (output_mapping_in_PAF_) {
              EmplaceBackMappingRecord(read_id, read_name, (uint16_t)read_length, barcode_key, fragment_start_position, fragment_length, mapq, direction, is_unique, 1, &((*mappings_on_diff_ref_seqs)[rid]));
              EmplaceBackMappingRecord(read_id, read_name, (uint16_t)read_length, barcode_key, fragment_start_position, fragment_length, mapq, direction, is_unique, 1, &((*mappings_on_diff_ref_seqs)[rid]));
            } else {
            } else {
@@ -1912,7 +1977,7 @@ void Chromap<MappingRecord>::ProcessBestMappingsForSingleEndRead(Direction mappi
}
}


template <typename MappingRecord>
template <typename MappingRecord>
void Chromap<MappingRecord>::GenerateBestMappingsForSingleEndRead(int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<std::pair<int, uint64_t> > &positive_mappings, const std::vector<int> &positive_split_sites, const std::vector<std::pair<int, uint64_t> > &negative_mappings, const std::vector<int> &negative_split_sites, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs) {
void Chromap<MappingRecord>::GenerateBestMappingsForSingleEndRead(int num_positive_candidates, int num_negative_candidates, uint32_t repetitive_seed_length, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<std::pair<int, uint64_t> > &positive_mappings, const std::vector<int> &positive_split_sites, const std::vector<std::pair<int, uint64_t> > &negative_mappings, const std::vector<int> &negative_split_sites, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs) {
  //uint8_t mapq = GetMAPQ(num_best_mappings, num_second_best_mappings);
  //uint8_t mapq = GetMAPQ(num_best_mappings, num_second_best_mappings);
  uint8_t mapq = 0;
  uint8_t mapq = 0;
  // we will use reservoir sampling 
  // we will use reservoir sampling 
@@ -1931,9 +1996,9 @@ void Chromap<MappingRecord>::GenerateBestMappingsForSingleEndRead(int min_num_er
  }
  }
  int best_mapping_index = 0;
  int best_mapping_index = 0;
  int num_best_mappings_reported = 0;
  int num_best_mappings_reported = 0;
  ProcessBestMappingsForSingleEndRead(kPositive, mapq, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings, read_batch, read_index, reference, barcode_batch, best_mapping_indices, positive_mappings, positive_split_sites, &best_mapping_index, &num_best_mappings_reported, mappings_on_diff_ref_seqs);
  ProcessBestMappingsForSingleEndRead(kPositive, mapq, num_positive_candidates, repetitive_seed_length, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings, read_batch, read_index, reference, barcode_batch, best_mapping_indices, positive_mappings, positive_split_sites, &best_mapping_index, &num_best_mappings_reported, mappings_on_diff_ref_seqs);
  if (num_best_mappings_reported != std::min(num_best_mappings, max_num_best_mappings_)) {
  if (num_best_mappings_reported != std::min(num_best_mappings, max_num_best_mappings_)) {
    ProcessBestMappingsForSingleEndRead(kNegative, mapq, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings, read_batch, read_index, reference, barcode_batch, best_mapping_indices, negative_mappings, negative_split_sites, &best_mapping_index, &num_best_mappings_reported, mappings_on_diff_ref_seqs);
    ProcessBestMappingsForSingleEndRead(kNegative, num_negative_candidates, repetitive_seed_length, mapq, min_num_errors, num_best_mappings, second_min_num_errors, num_second_best_mappings, read_batch, read_index, reference, barcode_batch, best_mapping_indices, negative_mappings, negative_split_sites, &best_mapping_index, &num_best_mappings_reported, mappings_on_diff_ref_seqs);
  }
  }
}
}


@@ -2444,14 +2509,47 @@ void Chromap<MappingRecord>::VerifyCandidatesOnOneDirection(Direction candidate_
    if (position < (uint32_t)error_threshold_ || position >= reference.GetSequenceLengthAt(rid) || position + read_length + error_threshold_ >= reference.GetSequenceLengthAt(rid)) {
    if (position < (uint32_t)error_threshold_ || position >= reference.GetSequenceLengthAt(rid) || position + read_length + error_threshold_ >= reference.GetSequenceLengthAt(rid)) {
      continue;
      continue;
    }
    }
    int mapping_end_position;
    int mapping_end_position = read_length;
    int gap_beginning = 0;
    int num_errors;
    int num_errors;
    int allow_gap_beginning_ = 20;
    int allow_gap_beginning = allow_gap_beginning_ - error_threshold_;
    
    if (split_alignment_) {
    if (split_alignment_) {
      int mapping_length_threshold = 30;
      int mapping_length_threshold = 30;

      if (candidate_direction == kPositive) {
      if (candidate_direction == kPositive) {
        num_errors = BandedAlignPatternToTextWithDropOff(reference.GetSequenceAt(rid) + position - error_threshold_, read, read_length, &mapping_end_position);
        num_errors = BandedAlignPatternToTextWithDropOff(reference.GetSequenceAt(rid) + position - error_threshold_, read, read_length, &mapping_end_position);
	if (mapping_end_position < 0 && allow_gap_beginning > 0) {
	  int backup_num_errors = num_errors;
	  int backup_mapping_end_position = mapping_end_position;
          num_errors = BandedAlignPatternToTextWithDropOff(reference.GetSequenceAt(rid) + position - error_threshold_ + allow_gap_beginning, read + allow_gap_beginning, read_length - allow_gap_beginning, &mapping_end_position);
	  if (num_errors > error_threshold_ || mapping_end_position < 0) {
	    num_errors = backup_num_errors;
	    mapping_end_position = backup_mapping_end_position;
	  }
	  else {
            gap_beginning = allow_gap_beginning;
	    mapping_end_position += gap_beginning; // realign the mapping end position as it is the alignment from the whole read
	                                           // I use this adjustment since "position" is based on the whole read,
						   // and it will be more consistent with no gap beginning case
	  }
	}
      } else {
      } else {
        num_errors = BandedAlignPatternToTextWithDropOffFrom3End(reference.GetSequenceAt(rid) + position - error_threshold_, negative_read.data(), read_length, &mapping_end_position);
        num_errors = BandedAlignPatternToTextWithDropOffFrom3End(reference.GetSequenceAt(rid) + position - error_threshold_, negative_read.data(), read_length, &mapping_end_position);
	if (mapping_end_position < 0 && allow_gap_beginning > 0) {
	  int backup_num_errors = num_errors;
	  int backup_mapping_end_position = mapping_end_position;
          num_errors = BandedAlignPatternToTextWithDropOffFrom3End(reference.GetSequenceAt(rid) + position - error_threshold_, negative_read.data(), read_length - allow_gap_beginning, &mapping_end_position);
	  if (num_errors > error_threshold_ || mapping_end_position < 0) {
	    num_errors = backup_num_errors;
	    mapping_end_position = backup_mapping_end_position;
	  }
	  else {
            gap_beginning = allow_gap_beginning;
            mapping_end_position += gap_beginning; 
	  }
	}
      }  
      }  
      //std::cerr << "ne1: " << num_errors << " " << mapping_end_position << "\n";
      //std::cerr << "ne1: " << num_errors << " " << mapping_end_position << "\n";
      //if (num_errors > 2 * error_threshold_) {
      //if (num_errors > 2 * error_threshold_) {
@@ -2492,11 +2590,12 @@ void Chromap<MappingRecord>::VerifyCandidatesOnOneDirection(Direction candidate_
        //if (split_alignment_) {
        //if (split_alignment_) {
        //  mappings->emplace_back(num_errors, candidates[ci].position - read_length + 1 - error_threshold_ + read_length + error_threshold_); 
        //  mappings->emplace_back(num_errors, candidates[ci].position - read_length + 1 - error_threshold_ + read_length + error_threshold_); 
        //} else {
        //} else {
          mappings->emplace_back(num_errors, candidates[ci].position - read_length + 1 - error_threshold_ + mapping_end_position); 
          // Need to minus gap_beginning because mapping_end_position is adjusted by it, but read_length is not.
          mappings->emplace_back(num_errors, candidates[ci].position - read_length + 1 - error_threshold_ + mapping_end_position - gap_beginning);
        //}
        //}
      }
      }
      if (split_alignment_) {
      if (split_alignment_) {
        split_sites->emplace_back(mapping_end_position - error_threshold_);
        split_sites->emplace_back((gap_beginning<<16)| (mapping_end_position - error_threshold_) );
      }
      }
    }
    }
  }
  }
@@ -2654,6 +2753,7 @@ int Chromap<MappingRecord>::BandedAlignPatternToText(const char *pattern, const
  return min_num_errors;
  return min_num_errors;
}
}


// Return negative number if the termination are deemed at the beginning of the read
template <typename MappingRecord>
template <typename MappingRecord>
int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOff(const char *pattern, const char *text, const int read_length, int *mapping_end_position) {
int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOff(const char *pattern, const char *text, const int read_length, int *mapping_end_position) {
  uint32_t Peq[5] = {0, 0, 0, 0, 0};
  uint32_t Peq[5] = {0, 0, 0, 0, 0};
@@ -2671,6 +2771,7 @@ int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOff(const char *patt
  uint32_t HP = 0;
  uint32_t HP = 0;
  int num_errors_at_band_start_position = 0;
  int num_errors_at_band_start_position = 0;
  int i = 0;
  int i = 0;
  int fail_beginning = 0; // the alignment failed at the beginning part
  for (; i < read_length; i++) {
  for (; i < read_length; i++) {
    uint8_t pattern_base = SequenceBatch::CharToUint8(pattern[i + 2 * error_threshold_]);
    uint8_t pattern_base = SequenceBatch::CharToUint8(pattern[i + 2 * error_threshold_]);
    Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask;
    Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask;
@@ -2684,12 +2785,21 @@ int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOff(const char *patt
    num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask);
    num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask);
    if (num_errors_at_band_start_position > 2 * error_threshold_) {
    if (num_errors_at_band_start_position > 2 * error_threshold_) {
      //return error_threshold_ + 1;
      //return error_threshold_ + 1;
      if (i < 4 * error_threshold_ && i < read_length / 2) {
        fail_beginning = 1;
      }
      break;
      break;
    }
    }
    for (int ai = 0; ai < 5; ai++) {
    for (int ai = 0; ai < 5; ai++) {
      Peq[ai] >>= 1;
      Peq[ai] >>= 1;
    }
    }
  }
  }

  /*char tmp[255] ;
  strncpy(tmp, pattern, read_length + 2 * error_threshold_);
  printf("%s\n%s\n", tmp, text);
  printf("%d\n", i) ;
  fflush(stdout);*/
  int band_start_position = i;
  int band_start_position = i;
  int min_num_errors = num_errors_at_band_start_position;
  int min_num_errors = num_errors_at_band_start_position;
  *mapping_end_position = band_start_position;
  *mapping_end_position = band_start_position;
@@ -2701,9 +2811,13 @@ int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOff(const char *patt
      *mapping_end_position = band_start_position + 1 + i;
      *mapping_end_position = band_start_position + 1 + i;
    }
    }
  }
  }
  if (fail_beginning) {
    *mapping_end_position = -*mapping_end_position;
  }
  return min_num_errors;
  return min_num_errors;
}
}



template <typename MappingRecord>
template <typename MappingRecord>
int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOffFrom3End(const char *pattern, const char *text, const int read_length, int *mapping_end_position) {
int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOffFrom3End(const char *pattern, const char *text, const int read_length, int *mapping_end_position) {
  uint32_t Peq[5] = {0, 0, 0, 0, 0};
  uint32_t Peq[5] = {0, 0, 0, 0, 0};
@@ -2721,6 +2835,7 @@ int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOffFrom3End(const ch
  uint32_t HP = 0;
  uint32_t HP = 0;
  int num_errors_at_band_start_position = 0;
  int num_errors_at_band_start_position = 0;
  int i = 0;
  int i = 0;
  int fail_beginning = 0; // the alignment failed at the beginning part
  for (; i < read_length; i++) {
  for (; i < read_length; i++) {
    uint8_t pattern_base = SequenceBatch::CharToUint8(pattern[read_length - 1 - i]);
    uint8_t pattern_base = SequenceBatch::CharToUint8(pattern[read_length - 1 - i]);
    Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask;
    Peq[pattern_base] = Peq[pattern_base] | highest_bit_in_band_mask;
@@ -2734,12 +2849,16 @@ int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOffFrom3End(const ch
    num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask);
    num_errors_at_band_start_position += 1 - (D0 & lowest_bit_in_band_mask);
    if (num_errors_at_band_start_position > 2 * error_threshold_) {
    if (num_errors_at_band_start_position > 2 * error_threshold_) {
      //return error_threshold_ + 1;
      //return error_threshold_ + 1;
      if (i < 4 * error_threshold_ && i < read_length / 2) {
        fail_beginning = 1;
      }
      break;
      break;
    }
    }
    for (int ai = 0; ai < 5; ai++) {
    for (int ai = 0; ai < 5; ai++) {
      Peq[ai] >>= 1;
      Peq[ai] >>= 1;
    }
    }
  }
  }
  //printf("li %d: %d %d %d\n", fail_beginning, i, error_threshold_, read_length);
  int band_start_position = i;
  int band_start_position = i;
  int min_num_errors = num_errors_at_band_start_position;
  int min_num_errors = num_errors_at_band_start_position;
  *mapping_end_position = band_start_position;
  *mapping_end_position = band_start_position;
@@ -2751,6 +2870,9 @@ int Chromap<MappingRecord>::BandedAlignPatternToTextWithDropOffFrom3End(const ch
      *mapping_end_position = band_start_position + (1 + i);
      *mapping_end_position = band_start_position + (1 + i);
    }
    }
  }
  }
  if (fail_beginning) {
    *mapping_end_position = -*mapping_end_position;
  }
  return min_num_errors;
  return min_num_errors;
}
}


+4 −2

File changed.

Preview size limit exceeded, changes collapsed.