Commit b1120d68 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Fix multiple bugs

parent 31a02b34
Loading
Loading
Loading
Loading
+99 −27
Original line number Diff line number Diff line
@@ -563,25 +563,39 @@ void Chromap<MappingRecord>::SupplementCandidates(const Index &index, uint32_t r
    uint32_t mm_count = minimizers->size();
    bool augment_flag = true;
    uint32_t candidate_num = positive_candidates->size();
    if (positive_candidates->size() >= (uint32_t)max_seed_frequencies_[0]) {
      augment_flag = false;
    } else {
      for (uint32_t i = 0; i < candidate_num; ++i) {
      if (positive_candidates->at(i).count >= mm_count / 2)
        if (positive_candidates->at(i).count >= mm_count / 2) {
          augment_flag = false;
          break;
        }
      }
    }
    candidate_num = negative_candidates->size();
    if (augment_flag) {
      if (negative_candidates->size() >= (uint32_t)max_seed_frequencies_[0]) {
        augment_flag = false;
      } else {
        for (uint32_t i = 0; i < candidate_num; ++i) {
        if (negative_candidates->at(i).count >= mm_count / 2)
          if (negative_candidates->at(i).count >= mm_count / 2) {
            augment_flag = false;
            break;
          }
        }
      }
    }
    if (augment_flag) {
      positive_hits->clear();
      negative_hits->clear();
      if (mate_positive_candidates->size() > 0) {
        index.GenerateCandidatesFromRepetitiveReadWithMateInfo(error_threshold_, *minimizers, repetitive_seed_length, negative_hits, augment_negative_candidates, mate_positive_candidates, -1, 2 * max_insert_size_);
      if (mate_positive_candidates->size() > 0 && mate_positive_candidates->size() <= (uint32_t)max_seed_frequencies_[0]) {
        //std::cerr << "Supplement positive" << "\n";
        index.GenerateCandidatesFromRepetitiveReadWithMateInfo(error_threshold_, *minimizers, repetitive_seed_length, negative_hits, augment_negative_candidates, mate_positive_candidates, kNegative, 2 * max_insert_size_);
      }
      if (mate_negative_candidates->size() > 0) {
        index.GenerateCandidatesFromRepetitiveReadWithMateInfo(error_threshold_, *minimizers, repetitive_seed_length, positive_hits, augment_positive_candidates, mate_negative_candidates, 1, 2 * max_insert_size_);
      if (mate_negative_candidates->size() > 0 && mate_negative_candidates->size() <= (uint32_t)max_seed_frequencies_[0]) {
        //std::cerr << "Supplement negative" << "\n";
        index.GenerateCandidatesFromRepetitiveReadWithMateInfo(error_threshold_, *minimizers, repetitive_seed_length, positive_hits, augment_positive_candidates, mate_negative_candidates, kPositive, 2 * max_insert_size_);
      }
    }
  }
@@ -757,6 +771,14 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
            minimizers2.reserve(read_batch2.GetSequenceLengthAt(pair_index) / window_size_ * 2);
            index.GenerateMinimizerSketch(read_batch1, pair_index, &minimizers1);
            index.GenerateMinimizerSketch(read_batch2, pair_index, &minimizers2);
            //std::cerr << "m1" << " " << minimizers1.size() << "\n";
            //for (auto &mi : minimizers1) {
            //  std::cerr << (mi.second >> 33) << " " << (uint32_t) (mi.second >> 1) << "\n";
            //}
            //std::cerr << "m2" << " " << minimizers2.size() << "\n";
            //for (auto &mi : minimizers2) {
            //  std::cerr << (mi.second >> 33) << " " << (uint32_t) (mi.second >> 1) << "\n";
            //}
            if (minimizers1.size() != 0 && minimizers2.size() != 0) {
              positive_hits1.clear();
              positive_hits2.clear();
@@ -792,7 +814,27 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
                mm_history2[pair_index].repetitive_seed_length = repetitive_seed_length2;
              }
              // Test whether we need to augment the candidate list with mate information.
              //std::cerr << "before supplement" << "\n";
              //std::cerr << "p1" << "\n";
              //for (auto &ci : positive_candidates1) {
              //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
              //}
              //std::cerr << "n1" << "\n";
              //for (auto &ci : negative_candidates1) {
              //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
              //}
              //std::cerr << "p2" << "\n";
              //for (auto &ci : positive_candidates2) {
              //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
              //}
              //std::cerr << "n2" << "\n";
              //for (auto &ci : negative_candidates2) {
              //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
              //}
              //if ((positive_candidates1.size() == 0 && negative_candidates1.size() == 0) || (positive_candidates2.size() == 0 && negative_candidates2.size() == 0)) {
              //if ((positive_candidates1.size() < 5000 && negative_candidates1.size() < 5000) || (positive_candidates2.size() == 0 && negative_candidates2.size() == 0)) {
                SupplementCandidates(index, repetitive_seed_length1, repetitive_seed_length2, minimizers1, minimizers2, positive_hits1, positive_hits2, positive_candidates1, positive_candidates2, positive_candidates1_buffer, positive_candidates2_buffer, negative_hits1, negative_hits2, negative_candidates1, negative_candidates2, negative_candidates1_buffer, negative_candidates2_buffer);
              //}
              current_num_candidates1 = positive_candidates1.size() + negative_candidates1.size();
              current_num_candidates2 = positive_candidates2.size() + negative_candidates2.size();
              if (current_num_candidates1 > 0 && current_num_candidates2 > 0) {
@@ -805,7 +847,41 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
                negative_candidates1.clear();
                negative_candidates2.clear();
                // Paired-end filter
                //std::cerr << "p1" << "\n";
                //for (auto &ci : positive_candidates1_buffer) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "n1" << "\n";
                //for (auto &ci : negative_candidates1_buffer) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "p2" << "\n";
                //for (auto &ci : positive_candidates2_buffer) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "n2" << "\n";
                //for (auto &ci : negative_candidates2_buffer) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "#pc1: " << positive_candidates1_buffer.size() << ", #nc1: " << negative_candidates1_buffer.size() << ", #pc2: " << positive_candidates2_buffer.size() << ", #nc2: " << negative_candidates2_buffer.size() << "\n";
                ReduceCandidatesForPairedEndRead(positive_candidates1_buffer, negative_candidates1_buffer, positive_candidates2_buffer, negative_candidates2_buffer, &positive_candidates1, &negative_candidates1, &positive_candidates2, &negative_candidates2);
                //std::cerr << "p1" << "\n";
                //for (auto &ci : positive_candidates1) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "n1" << "\n";
                //for (auto &ci : negative_candidates1) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "p2" << "\n";
                //for (auto &ci : positive_candidates2) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "n2" << "\n";
                //for (auto &ci : negative_candidates2) {
                //  std::cerr << (ci.position >> 32) << " " << (uint32_t) ci.position << " " << (int)ci.count << "\n";
                //}
                //std::cerr << "After pe filter, #pc1: " << positive_candidates1.size() << ", #nc1: " << negative_candidates1.size() << ", #pc2: " << positive_candidates2.size() << ", #nc2: " << negative_candidates2.size() << "\n";
                current_num_candidates1 = positive_candidates1.size() + negative_candidates1.size();
                current_num_candidates2 = positive_candidates2.size() + negative_candidates2.size();
                // Verify candidates
@@ -819,12 +895,8 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
                  int num_best_mappings1, num_second_best_mappings1;
                  int min_num_errors2, second_min_num_errors2;
                  int num_best_mappings2, num_second_best_mappings2;
                  //std::sort(positive_candidates1.begin(), positive_candidates1.end());
                  //std::sort(negative_candidates1.begin(), negative_candidates1.end());
                  VerifyCandidates(read_batch1, pair_index, reference, minimizers1, positive_candidates1, negative_candidates1, &positive_mappings1, &negative_mappings1, &min_num_errors1, &num_best_mappings1, &second_min_num_errors1, &num_second_best_mappings1);
                  uint32_t current_num_mappings1 = positive_mappings1.size() + negative_mappings1.size();
                  //std::sort(positive_candidates2.begin(), positive_candidates2.end());
                  //std::sort(negative_candidates2.begin(), negative_candidates2.end());
                  VerifyCandidates(read_batch2, pair_index, reference, minimizers2, positive_candidates2, negative_candidates2, &positive_mappings2, &negative_mappings2, &min_num_errors2, &num_best_mappings2, &second_min_num_errors2, &num_second_best_mappings2);
                  uint32_t current_num_mappings2 = positive_mappings2.size() + negative_mappings2.size();
                  if (current_num_mappings1 > 0 && current_num_mappings2 > 0) {
@@ -2000,9 +2072,9 @@ void Chromap<MappingRecord>::AllocateMultiMappings(uint32_t num_reference_sequen
}

template <typename MappingRecord>
void Chromap<MappingRecord>::MergeCandidates(std::vector<Candidate> &c1, const std::vector<Candidate> &c2, std::vector<Candidate> &buffer) {
void Chromap<MappingRecord>::MergeCandidates(std::vector<Candidate> &c1, std::vector<Candidate> &c2, std::vector<Candidate> &buffer) {
  if (c1.size() == 0) {
    c1 = c2;
    c1.swap(c2);
    return;
  }
  uint32_t i, j; 
@@ -2020,17 +2092,16 @@ void Chromap<MappingRecord>::MergeCandidates(std::vector<Candidate> &c1, const s
  i = 0; j = 0;
  while (i < size1 && j < size2) {
    if (c1[i].position == c2[j].position) {
      if (c1[i].count > c2[j].count)
      if (c1[i].count > c2[j].count) {
        buffer.push_back(c1[i]);
      else
      } else {
        buffer.push_back(c2[j]);
      ++i, ++j;
      }
    else if (c1[i].position < c2[j].position) {
      ++i, ++j;
    } else if (c1[i].position < c2[j].position) {
      buffer.push_back(c1[i]);
      ++i;
    }
    else {
    } else {
      buffer.push_back(c2[j]);
      ++j;
    }
@@ -2043,7 +2114,8 @@ void Chromap<MappingRecord>::MergeCandidates(std::vector<Candidate> &c1, const s
    buffer.push_back(c2[j]);
    ++j;
  }
  c1 = buffer;
  //c1 = buffer;
  c1.swap(buffer);
}

template <typename MappingRecord>
+1 −6
Original line number Diff line number Diff line
@@ -58,11 +58,6 @@ struct BarcodeWithQual {
  }
};

enum Direction {
  kPositive,
  kNegative,
};

#define SortMappingWithoutBarcode(m) (((((m).fragment_start_position<<16)|(m).fragment_length)<<8)|(m).mapq)
//#define SortMappingWithoutBarcode(m) (m)

@@ -151,7 +146,7 @@ class Chromap {
  void BandedAlign4PatternsToText(const char **patterns, const char *text, int read_length, int32_t *mapping_edit_distances, int32_t *mapping_end_positions);
  void BandedAlign8PatternsToText(const char **patterns, const char *text, int read_length, int16_t *mapping_edit_distances, int16_t *mapping_end_positions);
  void BandedTraceback(int min_num_errors, const char *pattern, const char *text, const int read_length, int *mapping_start_position);
  void MergeCandidates(std::vector<Candidate> &c1, const std::vector<Candidate> &c2, std::vector<Candidate> &buffer);
  void MergeCandidates(std::vector<Candidate> &c1, std::vector<Candidate> &c2, std::vector<Candidate> &buffer);
  void SupplementCandidates(const Index &index, uint32_t repetitive_seed_length1, uint32_t repetitive_seed_length2, std::vector<std::pair<uint64_t, uint64_t> > &minimizers1, std::vector<std::pair<uint64_t, uint64_t> > &minimizers2, std::vector<uint64_t> &positive_hits1, std::vector<uint64_t> &positive_hits2, std::vector<Candidate> &positive_candidates1, std::vector<Candidate> &positive_candidates2, std::vector<Candidate> &positive_candidates1_buffer, std::vector<Candidate> &positive_candidates2_buffer, std::vector<uint64_t> &negative_hits1, std::vector<uint64_t> &negative_hits2, std::vector<Candidate> &negative_candidates1, std::vector<Candidate> &negative_candidates2, std::vector<Candidate> &negative_candidates1_buffer, std::vector<Candidate> &negative_candidates2_buffer);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
+161 −204

File changed.

Preview size limit exceeded, changes collapsed.

+6 −1
Original line number Diff line number Diff line
@@ -15,6 +15,11 @@ namespace chromap {
#define KHashEqForIndex(a, b) ((a)>>1 == (b)>>1)
KHASH_INIT(k64, uint64_t, uint64_t, 1, KHashFunctionForIndex, KHashEqForIndex);

enum Direction {
  kPositive,
  kNegative,
};

struct Candidate {
  uint64_t position;
  uint8_t count;
@@ -75,7 +80,7 @@ class Index {
  void Load();
  void GenerateCandidatesOnOneDirection(int error_threshold, int num_seeds_required, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates) const;
  void GenerateCandidates(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, std::vector<Candidate> *positive_candidates, std::vector<Candidate> *negative_candidates) const;
  void GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates, std::vector<Candidate> *mate_candidates, int direction, unsigned int range) const;
  void GenerateCandidatesFromRepetitiveReadWithMateInfo(int error_threshold, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *hits, std::vector<Candidate> *candidates, std::vector<Candidate> *mate_candidates, Direction direction, uint32_t range) const;
  int CollectCandidates(int max_seed_frequency, int repetitive_seed_frequency, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, uint32_t *repetitive_seed_length, std::vector<uint64_t> *positive_hits, std::vector<uint64_t> *negative_hits, bool use_heap) const;
  inline static uint64_t Hash64(uint64_t key, const uint64_t mask) {
    key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;