Commit 8a275c17 authored by mourisl's avatar mourisl Committed by Li Song
Browse files

Fix an issue of keeping wrong mapq read during deduplication

parent 2e291b19
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -118,23 +118,25 @@ void MappingProcessor<MappingRecord>::RemovePCRDuplicate(
  for (uint32_t ri = 0; ri < num_reference_sequences; ++ri) {
    deduped_mappings.push_back(std::vector<MappingRecord>());
    if (mappings[ri].size() != 0) {
      // Ideally I should output the last of the dups of first mappings.
      deduped_mappings[ri].emplace_back(mappings[ri].front());
      // Haowen: Ideally I should output the last of the dups of first mappings.
      // Li: The mappings' mapq are sorted in increasing order, so we should put the last
      // map
      auto last_it = mappings[ri].begin();
      uint32_t last_dup_count = 1;

      for (auto it = ++(mappings[ri].begin()); it != mappings[ri].end(); ++it) {
        if (!((*it) == (*last_it))) {
          deduped_mappings[ri].emplace_back((*last_it));
          deduped_mappings[ri].back().num_dups_ = std::min(
              (uint32_t)std::numeric_limits<uint8_t>::max(), last_dup_count);
          last_dup_count = 1;
          deduped_mappings[ri].emplace_back((*it));
          last_it = it;
        } else {
          ++last_dup_count;
        }
        last_it = it;
      }

      deduped_mappings[ri].emplace_back((*last_it));
      deduped_mappings[ri].back().num_dups_ = std::min(
          (uint32_t)std::numeric_limits<uint8_t>::max(), last_dup_count);
      num_mappings += deduped_mappings[ri].size();
+4 −1
Original line number Diff line number Diff line
@@ -251,7 +251,6 @@ void MappingWriter<MappingRecord>::ProcessAndOutputMappingsInLowMemory(
    const bool current_mapping_is_duplicated =
        last_rid == min_rid && (current_mapping_is_duplicated_at_cell_level ||
                                current_mapping_is_duplicated_at_bulk_level);

    if (mapping_parameters_.remove_pcr_duplicates &&
        current_mapping_is_duplicated) {
      ++num_last_mapping_dups;
@@ -266,6 +265,9 @@ void MappingWriter<MappingRecord>::ProcessAndOutputMappingsInLowMemory(
          temp_dups_for_bulk_level_dedup.back().num_dups_ = 1;
        }
      }
      if (current_min_mapping.mapq_ > last_mapping.mapq_) {
        last_mapping = current_min_mapping ;
      }
    } else {
      if (!is_first_iteration) {
        if (deduplicate_at_bulk_level_for_single_cell_data) {
@@ -283,6 +285,7 @@ void MappingWriter<MappingRecord>::ProcessAndOutputMappingsInLowMemory(
          if (mapping_parameters_.Tn5_shift) {
            last_mapping.Tn5Shift();
          }

          AppendMapping(last_rid, reference, last_mapping);
          ++num_mappings_passing_filters;
          if (!mapping_parameters_.summary_metadata_file_path.empty())