Commit 935d24df authored by Haowen Zhang
Browse files

Reduce the number of function parameters.

1. introduced MappingMetadata, which owns the mapping metadata
   (e.g., minimizers, candidates, and mappings).
2. fixed a bug in the single-end cache, which failed to update
   the repetitive seed length.
3. need more tests on mapping single end reads.
4. more functions with too many parameters should be improved
   later.
parent 8f6e6b3a
Loading
Loading
Loading
Loading
+522 −574

File changed.

Preview size limit exceeded, changes collapsed.

+33 −63
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@
#include "index.h"
#include "khash.h"
#include "ksort.h"
#include "mapping_buffer.h"
#include "mapping_metadata.h"
#include "output_tools.h"
#include "sequence_batch.h"
#include "temp_mapping.h"
@@ -265,14 +265,7 @@ class Chromap {
      std::vector<Candidate> &filtered_candidates1,
      std::vector<Candidate> &filtered_candidates2);
  void ReduceCandidatesForPairedEndRead(
      const std::vector<Candidate> &positive_candidates1,
      const std::vector<Candidate> &negative_candidates1,
      const std::vector<Candidate> &positive_candidates2,
      const std::vector<Candidate> &negative_candidates2,
      std::vector<Candidate> &filtered_positive_candidates1,
      std::vector<Candidate> &filtered_negative_candidates1,
      std::vector<Candidate> &filtered_positive_candidates2,
      std::vector<Candidate> &filtered_negative_candidates2);
      PairedEndMappingMetadata &paired_end_mapping_metadata);
  void GenerateBestMappingsForPairedEndReadOnOneDirection(
      Direction first_read_direction, uint32_t pair_index, int num_candidates1,
      int min_num_errors1, int num_best_mappings1, int second_min_num_errors1,
@@ -283,8 +276,8 @@ class Chromap {
      const SequenceBatch &read_batch2, const SequenceBatch &reference,
      const std::vector<std::pair<int, uint64_t> > &mappings2,
      std::vector<std::pair<uint32_t, uint32_t> > &best_mappings,
      int *min_sum_errors, int *num_best_mappings, int *second_min_sum_errors,
      int *num_second_best_mappings);
      int &min_sum_errors, int &num_best_mappings, int &second_min_sum_errors,
      int &num_second_best_mappings);
  void RecalibrateBestMappingsForPairedEndReadOnOneDirection(
      Direction first_read_direction, uint32_t pair_index, int min_sum_errors,
      int second_min_sum_errors, int min_num_errors1, int num_best_mappings1,
@@ -316,23 +309,16 @@ class Chromap {
      const std::vector<int> &split_sites2,
      const std::vector<std::pair<uint32_t, uint32_t> > &best_mappings,
      int min_sum_errors, int num_best_mappings, int second_min_sum_errors,
      int num_second_best_mappings, int *best_mapping_index,
      int *num_best_mappings_reported, int force_mapq,
      std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
      int num_second_best_mappings, int &best_mapping_index,
      int &num_best_mappings_reported, int force_mapq,
      std::vector<std::vector<MappingRecord> > &mappings_on_diff_ref_seqs);
  void GenerateBestMappingsForPairedEndRead(
      uint32_t pair_index, int num_positive_candidates1,
      int num_negative_candidates1, uint32_t repetitive_seed_length1,
      int min_num_errors1, int num_best_mappings1, int second_min_num_errors1,
      int num_second_best_mappings1, const SequenceBatch &read_batch1,
      int num_positive_candidates2, int num_negative_candidates2,
      uint32_t repetitive_seed_length2, int min_num_errors2,
      int num_best_mappings2, int second_min_num_errors2,
      int num_second_best_mappings2, const SequenceBatch &read_batch2,
      const SequenceBatch &reference, const SequenceBatch &barcode_batch,
      std::vector<int> *best_mapping_indices, MappingBuffer &mapping_buffer,
      std::mt19937 *generator, int *min_sum_errors, int *num_best_mappings,
      int *second_min_sum_errors, int *num_second_best_mappings, int force_mapq,
      std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
      uint32_t pair_index, const SequenceBatch &read_batch1,
      const SequenceBatch &read_batch2, const SequenceBatch &barcode_batch,
      const SequenceBatch &reference, std::vector<int> &best_mapping_indices,
      std::mt19937 &generator, int force_mapq,
      PairedEndMappingMetadata &paired_end_mapping_metadata,
      std::vector<std::vector<MappingRecord> > &mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(
      uint32_t read_id, uint64_t barcode, uint32_t fragment_start_position,
      uint16_t fragment_length, uint8_t mapq, uint8_t direction,
@@ -358,17 +344,10 @@ class Chromap {
  // For single-end read mapping
  void MapSingleEndReads();
  void GenerateBestMappingsForSingleEndRead(
      int num_positive_candidates, int num_negative_candidates,
      uint32_t repetitive_seed_length, int min_num_errors,
      int num_best_mappings, int second_min_num_errors,
      int num_second_best_mappings, const SequenceBatch &read_batch,
      uint32_t read_index, const SequenceBatch &reference,
      const SequenceBatch &barcode_batch,
      const std::vector<std::pair<int, uint64_t> > &positive_mappings,
      const std::vector<int> &positive_split_sites,
      const std::vector<std::pair<int, uint64_t> > &negative_mappings,
      const std::vector<int> &negative_split_sites,
      std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
      const SequenceBatch &read_batch, uint32_t read_index,
      const SequenceBatch &reference, const SequenceBatch &barcode_batch,
      MappingMetadata &mapping_metadata,
      std::vector<std::vector<MappingRecord> > &mappings_on_diff_ref_seqs);
  void ProcessBestMappingsForSingleEndRead(
      Direction mapping_direction, uint8_t mapq, int num_candidates,
      uint32_t repetitive_seed_length, int min_num_errors,
@@ -378,9 +357,9 @@ class Chromap {
      const SequenceBatch &barcode_batch,
      const std::vector<int> &best_mapping_indices,
      const std::vector<std::pair<int, uint64_t> > &mappings,
      const std::vector<int> &split_sites, int *best_mapping_index,
      int *num_best_mappings_reported,
      std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
      const std::vector<int> &split_sites, int &best_mapping_index,
      int &num_best_mappings_reported,
      std::vector<std::vector<MappingRecord> > &mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(
      uint32_t read_id, uint32_t barcode, uint32_t fragment_start_position,
      uint16_t fragment_length, uint8_t mapq, uint8_t direction,
@@ -437,9 +416,9 @@ class Chromap {
                            const int read_length);
  void MergeCandidates(std::vector<Candidate> &c1, std::vector<Candidate> &c2,
                       std::vector<Candidate> &buffer);
  int SupplementCandidates(const Index &index, uint32_t repetitive_seed_length1,
                           uint32_t repetitive_seed_length2,
                           MappingBuffer &mapping_buffer);
  int SupplementCandidates(
      const Index &index,
      PairedEndMappingMetadata &paired_end_mapping_metadata);
  void PostProcessingInLowMemory(uint32_t num_mappings_in_mem,
                                 uint32_t num_reference_sequences,
                                 const SequenceBatch &reference);
@@ -447,29 +426,20 @@ class Chromap {
      Direction candidate_direction, const SequenceBatch &read_batch,
      uint32_t read_index, const SequenceBatch &reference,
      const std::vector<Candidate> &candidates,
      std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors,
      int *num_best_mappings, int *second_min_num_errors,
      int *num_second_best_mappings);
      std::vector<std::pair<int, uint64_t> > &mappings, int &min_num_errors,
      int &num_best_mappings, int &second_min_num_errors,
      int &num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(
      Direction candidate_direction, const SequenceBatch &read_batch,
      uint32_t read_index, const SequenceBatch &reference,
      const std::vector<Candidate> &candidates,
      std::vector<std::pair<int, uint64_t> > *mappings,
      std::vector<int> *split_sites, int *min_num_errors,
      int *num_best_mappings, int *second_min_num_errors,
      int *num_second_best_mappings);
  void VerifyCandidates(
      const SequenceBatch &read_batch, uint32_t read_index,
      std::vector<std::pair<int, uint64_t> > &mappings,
      std::vector<int> &split_sites, int &min_num_errors,
      int &num_best_mappings, int &second_min_num_errors,
      int &num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index,
                        const SequenceBatch &reference,
      const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,
      const std::vector<Candidate> &positive_candidates,
      const std::vector<Candidate> &negative_candidates,
      std::vector<std::pair<int, uint64_t> > *positive_mappings,
      std::vector<int> *positive_split_sites,
      std::vector<std::pair<int, uint64_t> > *negative_mappings,
      std::vector<int> *negative_split_sites, int *min_num_errors,
      int *num_best_mappings, int *second_min_num_errors,
      int *num_second_best_mappings);
                        MappingMetadata &mapping_metadata);
  void GenerateMDTag(const char *pattern, const char *text,
                     int mapping_start_position, int n_cigar,
                     const uint32_t *cigar, int &NM, std::string &MD_tag);
+12 −7
Original line number Diff line number Diff line
@@ -749,13 +749,18 @@ int Index::GenerateCandidatesFromRepetitiveReadWithMateInfo(
  return max_count;
}

void Index::GenerateCandidates(
    int error_threshold,
    const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,
    uint32_t &repetitive_seed_length, std::vector<uint64_t> &positive_hits,
    std::vector<uint64_t> &negative_hits,
    std::vector<Candidate> &positive_candidates,
    std::vector<Candidate> &negative_candidates) const {
void Index::GenerateCandidates(int error_threshold,
                               MappingMetadata &mapping_metadata) const {
  const std::vector<std::pair<uint64_t, uint64_t> > &minimizers =
      mapping_metadata.minimizers_;
  std::vector<uint64_t> &positive_hits = mapping_metadata.positive_hits_;
  std::vector<uint64_t> &negative_hits = mapping_metadata.negative_hits_;
  std::vector<Candidate> &positive_candidates =
      mapping_metadata.positive_candidates_;
  std::vector<Candidate> &negative_candidates =
      mapping_metadata.negative_candidates_;
  uint32_t &repetitive_seed_length = mapping_metadata.repetitive_seed_length_;

  repetitive_seed_length = 0;
  // bool recollect = true;
  int repetitive_seed_count = CollectCandidates(
+3 −18
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#include <vector>

#include "khash.h"
#include "mapping_metadata.h"
#include "sequence_batch.h"

//#define LI_DEBUG
@@ -20,17 +21,6 @@ enum Direction {
  kNegative,
};

// A position paired with a count.
// NOTE(review): count is presumably the number of seed hits supporting the
// position -- confirm against the code that fills it.
struct Candidate {
  uint64_t position;
  uint8_t count;

  // Sort order: larger count first; among equal counts, smaller position
  // first.
  bool operator<(const Candidate &rhs) const {
    return (count == rhs.count) ? (position < rhs.position)
                                : (count > rhs.count);
  }
};

struct mmHit {
  uint32_t mi;
  uint64_t position;
@@ -86,13 +76,8 @@ class Index {
  void GenerateCandidatesOnOneDirection(
      int error_threshold, int num_seeds_required, uint32_t num_minimizers,
      std::vector<uint64_t> &hits, std::vector<Candidate> &candidates) const;
  void GenerateCandidates(
      int error_threshold,
      const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,
      uint32_t &repetitive_seed_length, std::vector<uint64_t> &positive_hits,
      std::vector<uint64_t> &negative_hits,
      std::vector<Candidate> &positive_candidates,
      std::vector<Candidate> &negative_candidates) const;
  void GenerateCandidates(int error_threshold,
                          MappingMetadata &mapping_metadata) const;
  int GenerateCandidatesFromRepetitiveReadWithMateInfo(
      int error_threshold,
      const std::vector<std::pair<uint64_t, uint64_t> > &minimizers,

src/mapping_buffer.h

deleted100644 → 0
+0 −45
Original line number Diff line number Diff line
#ifndef MAPPINGBUFFER_H_
#define MAPPINGBUFFER_H_

#include <utility>
#include <vector>

namespace chromap {
// Plain aggregate that groups the intermediate vectors used while mapping, so
// they can be handed around as a single object.
//
// NOTE(review): the trailing 1/2 on each member presumably selects the first
// or second read of a pair, and positive/negative presumably selects the
// strand -- confirm against the callers.
//
// This header includes only <utility> and <vector>; Candidate and uint64_t
// must already be declared by whoever includes it.
struct MappingBuffer {
  // Minimizers, stored as pairs of 64-bit values (the meaning of each half of
  // the pair is not visible from this header).
  std::vector<std::pair<uint64_t, uint64_t>> minimizers1;
  std::vector<std::pair<uint64_t, uint64_t>> minimizers2;

  // Hits, kept separately for the positive and negative direction.
  std::vector<uint64_t> positive_hits1;
  std::vector<uint64_t> positive_hits2;
  std::vector<uint64_t> negative_hits1;
  std::vector<uint64_t> negative_hits2;

  // Candidates, kept separately for the positive and negative direction.
  std::vector<Candidate> positive_candidates1;
  std::vector<Candidate> positive_candidates2;
  std::vector<Candidate> negative_candidates1;
  std::vector<Candidate> negative_candidates2;

  // Second set of candidate vectors mirroring the four above.
  // NOTE(review): presumably scratch space used when merging or filtering
  // candidates -- confirm.
  std::vector<Candidate> positive_candidates1_buffer;
  std::vector<Candidate> positive_candidates2_buffer;
  std::vector<Candidate> negative_candidates1_buffer;
  std::vector<Candidate> negative_candidates2_buffer;

  // Mappings as (int, uint64_t) pairs.
  // NOTE(review): presumably (number of errors, reference position) -- confirm.
  std::vector<std::pair<int, uint64_t>> positive_mappings1;
  std::vector<std::pair<int, uint64_t>> positive_mappings2;
  std::vector<std::pair<int, uint64_t>> negative_mappings1;
  std::vector<std::pair<int, uint64_t>> negative_mappings2;

  // Split sites.
  // NOTE(review): presumably one entry per element of the matching *_mappings
  // vector -- confirm.
  std::vector<int> positive_split_sites1;
  std::vector<int> negative_split_sites1;
  std::vector<int> positive_split_sites2;
  std::vector<int> negative_split_sites2;

  // Best mappings as pairs of 32-bit values, one vector per F1R2 / F2R1 /
  // F1F2 / R1R2 combination.
  // NOTE(review): presumably index pairs into the read-1 and read-2 mapping
  // vectors for each orientation combination -- confirm.
  std::vector<std::pair<uint32_t, uint32_t>> F1R2_best_mappings;
  std::vector<std::pair<uint32_t, uint32_t>> F2R1_best_mappings;
  std::vector<std::pair<uint32_t, uint32_t>> F1F2_best_mappings;
  std::vector<std::pair<uint32_t, uint32_t>> R1R2_best_mappings;
};

}  // namespace chromap

#endif  // MAPPINGBUFFER_H_
Loading