Commit 5b72b44a authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Support quick peak calling

parent 66adea22
Loading
Loading
Loading
Loading
+185 −22
Original line number Diff line number Diff line
@@ -185,16 +185,71 @@ void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBa
  }
}

template <typename MappingRecord>
uint32_t Chromap<MappingRecord>::CallPeaks(uint16_t coverage_threshold, uint32_t num_reference_sequences, const SequenceBatch &reference) {
  return 0;
}
 
template <>
uint32_t Chromap<PairedEndMappingWithBarcode>::CallPeaks(uint16_t coverage_threshold, uint32_t num_reference_sequences, const SequenceBatch &reference) {
  double real_start_time = GetRealTime();
  std::vector<std::vector<PairedEndMappingWithBarcode> > &mappings = allocate_multi_mappings_ ? allocated_mappings_on_diff_ref_seqs_ : (remove_pcr_duplicates_ ? deduped_mappings_on_diff_ref_seqs_ : mappings_on_diff_ref_seqs_);
  // Build pileup
  for (uint32_t ri = 0; ri < num_reference_sequences; ++ri) {
    pileup_on_diff_ref_seqs_.emplace_back(std::vector<uint16_t>());
    pileup_on_diff_ref_seqs_[ri].assign(reference.GetSequenceLengthAt(ri), 0);
    for (size_t mi = 0; mi < mappings[ri].size(); ++mi) {
      for (uint16_t pi = 0; pi < mappings[ri][mi].fragment_length; ++pi) {
        ++pileup_on_diff_ref_seqs_[ri][mappings[ri][mi].fragment_start_position + pi];
      }
    }
  }
  std::cerr << "Built pileup in " << Chromap<>::GetRealTime() - real_start_time << "s.\n";
  real_start_time = GetRealTime();
  // Call and save peaks
  tree_extras_on_diff_ref_seqs_.clear();
  tree_info_on_diff_ref_seqs_.clear();
  tree_extras_on_diff_ref_seqs_.reserve(num_reference_sequences);
  tree_info_on_diff_ref_seqs_.reserve(num_reference_sequences);
  uint32_t peak_count = 0; 
  for (uint32_t ri = 0; ri < num_reference_sequences; ++ri) {
    tree_extras_on_diff_ref_seqs_.emplace_back(std::vector<uint32_t>());
    tree_extras_on_diff_ref_seqs_[ri].reserve(reference.GetSequenceLengthAt(ri) / 100);
    peaks_on_diff_ref_seqs_.emplace_back(std::vector<Peak>());
    uint32_t peak_start_position = 0;
    uint16_t peak_length = 0;
    for (size_t pi = 0; pi < reference.GetSequenceLengthAt(ri); ++pi) {
      if (pileup_on_diff_ref_seqs_[ri][pi] >= coverage_threshold) {
        if (peak_length == 0) { // start a new peak
          peak_start_position = pi;
        }
        ++peak_length; // extend the peak
      } else if (peak_length > 0) { // save the previous peak
        peaks_on_diff_ref_seqs_[ri].emplace_back(Peak{peak_start_position, peak_length, peak_count}); 
        tree_extras_on_diff_ref_seqs_[ri].emplace_back(0);
        output_tools_->OutputPeaks(peak_start_position, peak_length, ri, reference);
        ++peak_count;
        peak_length = 0;
      }
    }
    BuildAugmentedTreeForPeaks(ri);
  }
  std::cerr << "Call peaks and built peak augmented tree in " << Chromap<>::GetRealTime() - real_start_time << "s.\n";
  // Output feature matrix
  return peak_count;
}

template <typename MappingRecord>
void Chromap<MappingRecord>::OutputFeatureMatrix(uint32_t num_sequences, const SequenceBatch &reference) {
}

template <>
void Chromap<PairedEndMappingWithBarcode>::OutputFeatureMatrix(uint32_t num_sequences, const SequenceBatch &reference) {
  uint32_t bin_size = 5000;
  output_tools_->InitializeMatrixOutput(matrix_output_prefix_);
  output_tools_->OutputPeaks(bin_size, num_sequences, reference);
  //uint32_t bin_size = 5000;
  //output_tools_->OutputPeaks(bin_size, num_sequences, reference);
  uint32_t num_peaks = CallPeaks(3, num_sequences, reference);
  std::vector<std::vector<PairedEndMappingWithBarcode> > &mappings = allocate_multi_mappings_ ? allocated_mappings_on_diff_ref_seqs_ : (remove_pcr_duplicates_ ? deduped_mappings_on_diff_ref_seqs_ : mappings_on_diff_ref_seqs_);
  double real_start_time = GetRealTime();
  // First pass to index barcodes
  uint32_t barcode_index = 0;
  for (uint32_t rid = 0; rid < num_sequences; ++rid) {
@@ -211,15 +266,22 @@ void Chromap<PairedEndMappingWithBarcode>::OutputFeatureMatrix(uint32_t num_sequ
      }
    }
  }
  std::cerr << "Index and output barcodes in " << Chromap<>::GetRealTime() - real_start_time << "s.\n";
  real_start_time = GetRealTime();
  // Second pass to generate matrix
  khash_t(kmatrix) *matrix = kh_init(kmatrix);
  std::vector<uint32_t> overlapped_peak_indices;
  for (uint32_t rid = 0; rid < num_sequences; ++rid) {
    for (uint32_t mi = 0; mi < mappings[rid].size(); ++mi) {
      uint32_t barcode_key = mappings[rid][mi].cell_barcode;
      khiter_t barcode_index_table_iterator = kh_get(k32, barcode_index_table_, barcode_key);
      uint64_t barcode_index = kh_value(barcode_index_table_, barcode_index_table_iterator);
      uint32_t peak_index = Position2PeakIndex(bin_size, rid, mappings[rid][mi].fragment_start_position, num_sequences, reference);
      uint64_t entry_index = (barcode_index << 32) | peak_index;
      //uint32_t peak_index = Position2PeakIndex(bin_size, rid, mappings[rid][mi].fragment_start_position, num_sequences, reference);
      overlapped_peak_indices.clear();
      GetNumOverlappedPeaks(rid, mappings[rid][mi], overlapped_peak_indices);
      for (size_t pi = 0; pi < overlapped_peak_indices.size(); ++pi) {
        //uint64_t entry_index = (barcode_index << 32) | peak_index;
        uint64_t entry_index = (barcode_index << 32) | overlapped_peak_indices[pi];
        khiter_t matrix_iterator = kh_get(kmatrix, matrix, entry_index);
        if (matrix_iterator == kh_end(matrix)) {
          int khash_return_code;
@@ -231,21 +293,24 @@ void Chromap<PairedEndMappingWithBarcode>::OutputFeatureMatrix(uint32_t num_sequ
        }
      }
    }
  // Output matrix
  uint32_t num_peaks = 0;
  for (uint32_t i = 0; i < num_sequences; ++i) {
    uint32_t ref_seq_length = reference.GetSequenceLengthAt(i);
    num_peaks += ref_seq_length / bin_size;
    if (ref_seq_length % bin_size != 0) {
      ++num_peaks;
    }
  }
  std::cerr << "Generate feature matrix in " << Chromap<>::GetRealTime() - real_start_time << "s.\n";
  // Output matrix
  //uint32_t num_peaks = 0;
  //for (uint32_t i = 0; i < num_sequences; ++i) {
  //  uint32_t ref_seq_length = reference.GetSequenceLengthAt(i);
  //  num_peaks += ref_seq_length / bin_size;
  //  if (ref_seq_length % bin_size != 0) {
  //    ++num_peaks;
  //  }
  //}
  real_start_time = GetRealTime();
  output_tools_->WriteMatrixOutputHead(num_peaks, kh_size(barcode_index_table_), kh_size(matrix));
  uint64_t key;
  uint32_t value;
  kh_foreach(matrix, key, value, output_tools_->AppendMatrixOutput((uint32_t)key, (uint32_t)(key >> 32), value));
  kh_destroy(kmatrix, matrix);
  output_tools_->FinalizeMatrixOutput();
  std::cerr << "Output feature matrix in " << Chromap<>::GetRealTime() - real_start_time << "s.\n";
}

template <typename MappingRecord>
@@ -260,6 +325,89 @@ uint32_t Chromap<MappingRecord>::Position2PeakIndex(uint32_t bin_size, uint32_t
  return peak_index;
}

template <typename MappingRecord>
void Chromap<MappingRecord>::BuildAugmentedTreeForPeaks(uint32_t ref_id) {
  //std::sort(mappings.begin(), mappings.end(), IntervalLess());
  int max_level = 0;
  size_t i, last_i = 0; // last_i points to the rightmost node in the tree
  uint32_t last = 0; // last is the max value at node last_i
  int k;
  std::vector<Peak> &peaks = peaks_on_diff_ref_seqs_[ref_id];
  std::vector<uint32_t> &extras = tree_extras_on_diff_ref_seqs_[ref_id];
  if (peaks.size() == 0) {
    max_level = -1;
  }
  for (i = 0; i < peaks.size(); i += 2) { 
    last_i = i; 
    //last = mappings[i].max = mappings[i].en; // leaves (i.e. at level 0)
    last = extras[i] = peaks[i].start_position + peaks[i].length; // leaves (i.e. at level 0)
  }
  for (k = 1; 1LL<<k <= (int64_t)peaks.size(); ++k) { // process internal nodes in the bottom-up order
    size_t x = 1LL<<(k-1);
    size_t i0 = (x<<1) - 1;
    size_t step = x<<2; // i0 is the first node
    for (i = i0; i < peaks.size(); i += step) { // traverse all nodes at level k
      uint32_t el = extras[i - x];                          // max value of the left child
      uint32_t er = i + x < peaks.size()? extras[i + x] : last; // of the right child
      uint32_t e = peaks[i].start_position + peaks[i].length;
      e = e > el? e : el;
      e = e > er? e : er;
      extras[i] = e; // set the max value for node i
    }
    last_i = last_i>>k&1? last_i - x : last_i + x; // last_i now points to the parent of the original last_i
    if (last_i < peaks.size() && extras[last_i] > last) // update last accordingly
      last = extras[last_i];
  }
  max_level = k - 1;
  tree_info_on_diff_ref_seqs_.emplace_back(max_level, peaks.size());
}

template <typename MappingRecord>
uint32_t Chromap<MappingRecord>::GetNumOverlappedPeaks(uint32_t ref_id, const MappingRecord &mapping, std::vector<uint32_t> &overlapped_peak_indices) {
  int t = 0;
  StackCell stack[64];
  //out.clear();
  overlapped_peak_indices.clear();
  int num_overlapped_peaks = 0;
  int max_level = tree_info_on_diff_ref_seqs_[ref_id].first;
  uint32_t num_tree_nodes = tree_info_on_diff_ref_seqs_[ref_id].second;
  std::vector<Peak> &peaks = peaks_on_diff_ref_seqs_[ref_id];
  std::vector<uint32_t> &extras = tree_extras_on_diff_ref_seqs_[ref_id];
  //uint32_t interval_start = mapping.fragment_start_position; 
  uint32_t interval_start = mapping.fragment_start_position > (uint32_t)multi_mapping_allocation_distance_ ? mapping.fragment_start_position - multi_mapping_allocation_distance_ : 0;
  uint32_t interval_end = mapping.fragment_start_position + mapping.fragment_length + (uint32_t)multi_mapping_allocation_distance_;
  stack[t++] = StackCell(max_level, (1LL<<max_level) - 1, 0); // push the root; this is a top down traversal
  while (t) { // the following guarantees that numbers in out[] are always sorted
    StackCell z = stack[--t];
    if (z.k <= 3) { // we are in a small subtree; traverse every node in this subtree
      size_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
      if (i1 >= num_tree_nodes) {
        i1 = num_tree_nodes;
      }
      for (i = i0; i < i1 && peaks[i].start_position < interval_end; ++i) {
        if (interval_start < peaks[i].start_position + peaks[i].length) { // if overlap, append to out[]
          //out.push_back(i);
          overlapped_peak_indices.emplace_back(peaks[i].index);
          ++num_overlapped_peaks;
        }
      }
    } else if (z.w == 0) { // if left child not processed
      size_t y = z.x - (1LL<<(z.k-1)); // the left child of z.x; NB: y may be out of range (i.e. y>=a.size())
      stack[t++] = StackCell(z.k, z.x, 1); // re-add node z.x, but mark the left child having been processed
      if (y >= num_tree_nodes || extras[y] > interval_start) // push the left child if y is out of range or may overlap with the query
        stack[t++] = StackCell(z.k - 1, y, 0);
    } else if (z.x < num_tree_nodes && peaks[z.x].start_position < interval_end) { // need to push the right child
      if (interval_start < peaks[z.x].start_position + peaks[z.x].length) {
        //out.push_back(z.x); // test if z.x overlaps the query; if yes, append to out[]
        overlapped_peak_indices.emplace_back(peaks[z.x].index);
        ++num_overlapped_peaks;
      }
      stack[t++] = StackCell(z.k - 1, z.x + (1LL<<(z.k-1)), 0); // push the right child
    }
  }
  return num_overlapped_peaks;
}

template <typename MappingRecord>
uint32_t Chromap<MappingRecord>::LoadPairedEndReadsWithBarcodes(SequenceBatch *read_batch1, SequenceBatch *read_batch2, SequenceBatch *barcode_batch) {
  double real_start_time = Chromap<>::GetRealTime();
@@ -529,7 +677,9 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
    OutputMappings(num_reference_sequences, reference, mappings);
  }
  if (!is_bulk_data_ && !matrix_output_prefix_.empty()) {
    output_tools_->InitializeMatrixOutput(matrix_output_prefix_);
    OutputFeatureMatrix(num_reference_sequences, reference);
    output_tools_->FinalizeMatrixOutput();
  }
  output_tools_->FinalizeMappingOutput();
  reference.FinalizeLoading();
@@ -1611,6 +1761,19 @@ void Chromap<MappingRecord>::VerifyCandidates(const SequenceBatch &read_batch, u

template <typename MappingRecord>
int Chromap<MappingRecord>::BandedAlignPatternToText(const char *pattern, const char *text, const int read_length, int *mapping_end_position) {
  //int error_count = 0;
  //for (int i = 0; i < read_length; ++i) {
  //  if (pattern[i + error_threshold_] != text[i]) {
  //    ++error_count;
  //    if (error_count > 1) {
  //      break;
  //    }
  //  } 
  //}
  //if (error_count <= 1) {
  //  *mapping_end_position = read_length - 1 + error_threshold_;
  //  return error_count;
  //}
  uint32_t Peq[5] = {0, 0, 0, 0, 0};
  for (int i = 0; i < 2 * error_threshold_; i++) {
    uint8_t base = SequenceBatch::CharToUint8(pattern[i]);
+13 −1
Original line number Diff line number Diff line
@@ -28,6 +28,12 @@ struct StackCell {
  StackCell(int k_, size_t x_, int w_) : x(x_), k(k_), w(w_) {};
};

struct Peak {
  uint32_t start_position;
  uint16_t length;
  uint32_t index;
};

KHASH_MAP_INIT_INT64(k128, uint128_t);
KHASH_MAP_INIT_INT(k32, uint32_t);
KHASH_SET_INIT_INT(k32_set);
@@ -150,8 +156,11 @@ class Chromap {
  uint32_t GetNumOverlappedMappings(uint32_t ref_id, const MappingRecord &mapping);
  void LoadBarcodeWhitelist();
  void CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch *barcode_batch, uint64_t *num_barcode_in_whitelist, uint64_t *num_corrected_barcode);
  uint32_t CallPeaks(uint16_t coverage_threshold, uint32_t num_reference_sequences, const SequenceBatch &reference);
  void OutputFeatureMatrix(uint32_t num_sequences, const SequenceBatch &reference);
  uint32_t Position2PeakIndex(uint32_t bin_size, uint32_t rid, uint32_t position, uint32_t num_sequences, const SequenceBatch &reference);
  uint32_t GetNumOverlappedPeaks(uint32_t ref_id, const MappingRecord &mapping, std::vector<uint32_t> &overlapped_peak_indices);
  void BuildAugmentedTreeForPeaks(uint32_t ref_id);
  void OutputMappingsInVector(uint8_t mapq_threshold, uint32_t num_reference_sequences, const SequenceBatch &reference, const std::vector<std::vector<MappingRecord> > &mappings);
  void OutputMappings(uint32_t num_reference_sequences, const SequenceBatch &reference, const std::vector<std::vector<MappingRecord> > &mappings);
  inline static double GetRealTime() {
@@ -220,7 +229,7 @@ class Chromap {
  std::vector<std::vector<MappingRecord> > deduped_mappings_on_diff_ref_seqs_;
  std::vector<std::pair<uint32_t, MappingRecord> > multi_mappings_;
  std::vector<std::vector<MappingRecord> > allocated_mappings_on_diff_ref_seqs_;
  std::vector<std::vector<uint32_t> > tree_extras_on_diff_ref_seqs_;
  std::vector<std::vector<uint32_t> > tree_extras_on_diff_ref_seqs_; // max
  std::vector<std::pair<int, uint32_t> > tree_info_on_diff_ref_seqs_; // (max_level, # nodes)
  std::unique_ptr<OutputTools<MappingRecord> > output_tools_;
  // For mapping stats
@@ -235,6 +244,9 @@ class Chromap {
  uint64_t num_corrected_barcode_ = 0;
  khash_t(k32)* barcode_histogram_;
  khash_t(k32)* barcode_index_table_;
  // For peak calling
  std::vector<std::vector<uint16_t> > pileup_on_diff_ref_seqs_;
  std::vector<std::vector<Peak> > peaks_on_diff_ref_seqs_;
};
} // namespace chromap

+4 −0
Original line number Diff line number Diff line
@@ -165,6 +165,10 @@ class OutputTools {
      }
    } 
  }
  void OutputPeaks(uint32_t peak_start_position, uint16_t peak_length, uint32_t rid, const SequenceBatch &reference) {
    const char *sequence_name = reference.GetSequenceNameAt(rid);
    fprintf(peak_output_file_, "%s\t%u\t%u\n", sequence_name, peak_start_position + 1, peak_start_position + peak_length);
  }
  void AppendBarcodeOutput(uint32_t barcode_key) {
    fprintf(barcode_output_file_, "%s-1\n", Seed2Sequence(barcode_key, cell_barcode_length_).data());
  }