Commit 964f9814 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Output barcode stats

parent b6eaf37f
Loading
Loading
Loading
Loading
+20 −4
Original line number Diff line number Diff line
@@ -120,12 +120,13 @@ bool Chromap<MappingRecord>::PairedEndReadWithBarcodeIsDuplicate(uint32_t pair_i
}

template <typename MappingRecord>
void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch *barcode_batch) {
void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch *barcode_batch, uint64_t *num_barcode_in_whitelist, uint64_t *num_corrected_barcode) {
  uint32_t barcode_length = barcode_batch->GetSequenceLengthAt(barcode_index);
  uint32_t barcode_key = barcode_batch->GenerateSeedFromSequenceAt(barcode_index, 0, barcode_length);
  khiter_t barcode_whitelist_lookup_table_iterator = kh_get(k32_set, barcode_whitelist_lookup_table_, barcode_key);
  if (barcode_whitelist_lookup_table_iterator != kh_end(barcode_whitelist_lookup_table_)) {
    // Correct barcode, nothing to do
    ++(*num_barcode_in_whitelist);
  } else {
    // Need to correct this barcode
    //const char *barcode = barcode_batch->GetSequenceAt(barcode_index);
@@ -158,6 +159,7 @@ void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBa
      // Just correct it
      //std::cerr << "Corrected the barcode from " << barcode << " to ";
      barcode_batch->CorrectBaseAt(barcode_index, corrected_barcodes_with_quals[0].corrected_base_index, corrected_barcodes_with_quals[0].correct_base);
      ++(*num_corrected_barcode);
      //std::cerr << barcode << "\n";
    } else {
      // Randomly select one of the best corrections
@@ -176,6 +178,7 @@ void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBa
      }
      //std::cerr << "Corrected the barcode from " << barcode << " to ";
      barcode_batch->CorrectBaseAt(barcode_index, corrected_barcodes_with_quals[best_corrected_barcode_index].corrected_base_index, corrected_barcodes_with_quals[best_corrected_barcode_index].correct_base);
      ++(*num_corrected_barcode);
      //std::cerr << barcode << "\n";
    }
  }
@@ -275,8 +278,10 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
  static uint64_t thread_num_mappings = 0;
  static uint64_t thread_num_mapped_reads = 0; 
  static uint64_t thread_num_uniquely_mapped_reads = 0; 
#pragma omp threadprivate(thread_num_candidates, thread_num_mappings, thread_num_mapped_reads, thread_num_uniquely_mapped_reads)
#pragma omp parallel default(none) shared(reference, index, read_batch1, read_batch2, barcode_batch, read_batch1_for_loading, read_batch2_for_loading, barcode_batch_for_loading, std::cerr, num_loaded_pairs_for_loading, num_loaded_pairs, num_reference_sequences, mappings_on_diff_ref_seqs_for_diff_threads, mappings_on_diff_ref_seqs_for_diff_threads_for_saving) num_threads(num_threads_) reduction(+:num_candidates_, num_mappings_, num_mapped_reads_, num_uniquely_mapped_reads_)
  static uint64_t thread_num_barcode_in_whitelist = 0; 
  static uint64_t thread_num_corrected_barcode = 0; 
#pragma omp threadprivate(thread_num_candidates, thread_num_mappings, thread_num_mapped_reads, thread_num_uniquely_mapped_reads, thread_num_barcode_in_whitelist, thread_num_corrected_barcode)
#pragma omp parallel default(none) shared(reference, index, read_batch1, read_batch2, barcode_batch, read_batch1_for_loading, read_batch2_for_loading, barcode_batch_for_loading, std::cerr, num_loaded_pairs_for_loading, num_loaded_pairs, num_reference_sequences, mappings_on_diff_ref_seqs_for_diff_threads, mappings_on_diff_ref_seqs_for_diff_threads_for_saving) num_threads(num_threads_) reduction(+:num_candidates_, num_mappings_, num_mapped_reads_, num_uniquely_mapped_reads_, num_barcode_in_whitelist_, num_corrected_barcode_)
  {
  std::vector<std::pair<uint64_t, uint64_t> > minimizers1;
  std::vector<std::pair<uint64_t, uint64_t> > minimizers2;
@@ -330,7 +335,7 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
        TrimAdapterForPairedEndRead(pair_index, &read_batch1, &read_batch2);
      }
      if (!barcode_whitelist_file_path_.empty()) {
        CorrectBarcodeAt(pair_index, &barcode_batch); 
        CorrectBarcodeAt(pair_index, &barcode_batch, &thread_num_barcode_in_whitelist, &thread_num_corrected_barcode); 
      }
      minimizers1.clear();
      minimizers2.clear();
@@ -408,6 +413,8 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
    std::cerr << "Mapped in " << Chromap<>::GetRealTime() - real_batch_start_time << "s.\n";
  }
  } // end of openmp single
  num_barcode_in_whitelist_ += thread_num_barcode_in_whitelist;
  num_corrected_barcode_ += thread_num_corrected_barcode;
  num_candidates_ += thread_num_candidates;
  num_mappings_ += thread_num_mappings;
  num_mapped_reads_ += thread_num_mapped_reads;
@@ -421,6 +428,9 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
  std::cerr << "Mapped all reads in " << Chromap<>::GetRealTime() - real_start_mapping_time << "s.\n";
  OutputMappingStatistics();
  OutputMappingStatistics(num_reference_sequences, mappings_on_diff_ref_seqs_, mappings_on_diff_ref_seqs_);
  if (!is_bulk_data_) {
    OutputBarcodeStatistics();
  }
  if (Tn5_shift_) {
    ApplyTn5ShiftOnPairedEndMapping(num_reference_sequences, &mappings_on_diff_ref_seqs_);
  }
@@ -1477,6 +1487,12 @@ void Chromap<MappingRecord>::BandedTraceback(int min_num_errors, const char *pat
  }
}

template <typename MappingRecord>
void Chromap<MappingRecord>::OutputBarcodeStatistics() {
  std::cerr << "Number of barcodes in whitelist: " << num_barcode_in_whitelist_ << ".\n";
  std::cerr << "Number of corrected barcodes: " << num_corrected_barcode_ << ".\n";
}

template <typename MappingRecord>
void Chromap<MappingRecord>::OutputMappingStatistics() {
  std::cerr << "Number of reads: " << num_reads_ << ".\n";
+12 −1
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@ struct StackCell {

KHASH_MAP_INIT_INT64(k128, uint128_t);
KHASH_MAP_INIT_INT(k32, uint32_t);
//KHASH_MAP_INIT_INT(k64, uint64_t);
KHASH_SET_INIT_INT(k32_set);

struct BarcodeWithQual {
@@ -63,18 +64,23 @@ class Chromap {
  Chromap(int kmer_size, int window_size, int num_threads, const std::string &reference_file_path, const std::string &index_file_path) : kmer_size_(kmer_size), window_size_(window_size), num_threads_(num_threads), reference_file_path_(reference_file_path), index_file_path_(index_file_path) {
    barcode_lookup_table_ = NULL;
    barcode_whitelist_lookup_table_ = NULL;
    barcode_histogram_ = NULL;
  }

  // For mapping
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool Tn5_shift, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, const std::string &reference_file_path, const std::string &index_file_path, const std::string &read_file1_path, const std::string &read_file2_path, const std::string &barcode_file_path, const std::string &barcode_whitelist_file_path, const std::string &mapping_output_file_path) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), Tn5_shift_(Tn5_shift), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_path_(read_file1_path), read_file2_path_(read_file2_path), barcode_file_path_(barcode_file_path), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path) {
    barcode_lookup_table_ = kh_init(k32);
    barcode_whitelist_lookup_table_ = kh_init(k32_set);
    barcode_histogram_ = kh_init(k64);
  }

  ~Chromap(){
    if (barcode_whitelist_lookup_table_ != NULL) {
      kh_destroy(k32_set, barcode_whitelist_lookup_table_);
    }
    if (barcode_histogram_ != NULL) {
      kh_destroy(k64, barcode_histogram_);
    }
    if (barcode_lookup_table_ != NULL) {
      kh_destroy(k32, barcode_lookup_table_);
    }
@@ -121,6 +127,7 @@ class Chromap {
  void AllocateMultiMappings(uint32_t num_reference_sequences);
  void RemovePCRDuplicate(uint32_t num_reference_sequences);
  void MoveMappingsInBuffersToMappingContainer(uint32_t num_reference_sequences, std::vector<std::vector<std::vector<MappingRecord> > > *mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
  void OutputBarcodeStatistics();
  void OutputMappingStatistics();
  void OutputMappingStatistics(uint32_t num_reference_sequences, const std::vector<std::vector<MappingRecord> > &uni_mappings, const std::vector<std::vector<MappingRecord> > &multi_mappings);
  uint8_t GetMAPQ(int num_positive_candidates, int num_negative_candidates, uint16_t alignment_length, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings);
@@ -128,7 +135,7 @@ class Chromap {
  void BuildAugmentedTree(uint32_t ref_id);
  uint32_t GetNumOverlappedMappings(uint32_t ref_id, const MappingRecord &mapping);
  void LoadBarcodeWhitelist();
  void CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch *barcode_batch);
  void CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch *barcode_batch, uint64_t *num_barcode_in_whitelist, uint64_t *num_corrected_barcode);
  void OutputMappingsInVector(uint8_t mapq_threshold, uint32_t num_reference_sequences, const SequenceBatch &reference, const std::vector<std::vector<MappingRecord> > &mappings);
  void OutputMappings(uint32_t num_reference_sequences, const SequenceBatch &reference, const std::vector<std::vector<MappingRecord> > &mappings);
  inline static double GetRealTime() {
@@ -205,6 +212,10 @@ class Chromap {
  uint64_t num_uniquely_mapped_reads_ = 0;
  uint64_t num_reads_ = 0;
  uint64_t num_duplicated_reads_ = 0; // # identical reads
  // For barcode stats
  uint64_t num_barcode_in_whitelist_ = 0;
  uint64_t num_corrected_barcode_ = 0;
  khash_t(k64)* barcode_histogram_;
};
} // namespace chromap