Commit ac2692de authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Improve barcode correction

parent 4a00cfa5
Loading
Loading
Loading
Loading
+60 −19
Original line number Diff line number Diff line
@@ -133,7 +133,7 @@ template <typename MappingRecord>
void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch *barcode_batch, uint64_t *num_barcode_in_whitelist, uint64_t *num_corrected_barcode) {
  uint32_t barcode_length = barcode_batch->GetSequenceLengthAt(barcode_index);
  uint32_t barcode_key = barcode_batch->GenerateSeedFromSequenceAt(barcode_index, 0, barcode_length);
  khiter_t barcode_whitelist_lookup_table_iterator = kh_get(k32_set, barcode_whitelist_lookup_table_, barcode_key);
  khiter_t barcode_whitelist_lookup_table_iterator = kh_get(k32, barcode_whitelist_lookup_table_, barcode_key);
  if (barcode_whitelist_lookup_table_iterator != kh_end(barcode_whitelist_lookup_table_)) {
    // Correct barcode
    ++(*num_barcode_in_whitelist);
@@ -155,10 +155,12 @@ void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBa
        base_to_change &= mask;
        // generate the corrected key
        uint32_t corrected_barcode_key = barcode_key_to_change | (base_to_change << (2 * i));
        barcode_whitelist_lookup_table_iterator = kh_get(k32_set, barcode_whitelist_lookup_table_, corrected_barcode_key);
        barcode_whitelist_lookup_table_iterator = kh_get(k32, barcode_whitelist_lookup_table_, corrected_barcode_key);
        double barcode_abundance = kh_value(barcode_whitelist_lookup_table_, barcode_whitelist_lookup_table_iterator) / (double)num_sample_barcodes_;
        double score = (1 - pow(10.0, ((-(int)barcode_qual[barcode_length - 1 - i]) / 10.0))) * barcode_abundance;
        if (barcode_whitelist_lookup_table_iterator != kh_end(barcode_whitelist_lookup_table_)) {
          // find one possible corrected barcode
          corrected_barcodes_with_quals.emplace_back(BarcodeWithQual{barcode_length - 1 - i, SequenceBatch::Uint8ToChar(base_to_change), barcode_qual[barcode_length - 1 - i]});
          corrected_barcodes_with_quals.emplace_back(BarcodeWithQual{barcode_length - 1 - i, SequenceBatch::Uint8ToChar(base_to_change), barcode_qual[barcode_length - 1 - i], barcode_abundance, score});
        }
      }
    }
@@ -169,26 +171,32 @@ void Chromap<MappingRecord>::CorrectBarcodeAt(uint32_t barcode_index, SequenceBa
      // Just correct it
      //std::cerr << "Corrected the barcode from " << barcode << " to ";
      barcode_batch->CorrectBaseAt(barcode_index, corrected_barcodes_with_quals[0].corrected_base_index, corrected_barcodes_with_quals[0].correct_base);
      //std::cerr << "score: " << corrected_barcodes_with_quals[0].score << "\n";
      ++(*num_corrected_barcode);
      //std::cerr << barcode << "\n";
    } else {
      // Randomly select one of the best corrections
      std::sort(corrected_barcodes_with_quals.begin(), corrected_barcodes_with_quals.end(), std::greater<BarcodeWithQual>());
      int num_ties = 0;
      for (size_t ci = 1; ci < num_possible_corrected_barcodes; ++ci) {
        if (corrected_barcodes_with_quals[ci].qual == corrected_barcodes_with_quals[0].qual) {
          ++num_ties;
        }
      //int num_ties = 0;
      double sum_score = 0;
      for (size_t ci = 0; ci < num_possible_corrected_barcodes; ++ci) {
        sum_score += corrected_barcodes_with_quals[ci].score;
        //if (corrected_barcodes_with_quals[ci].qual == corrected_barcodes_with_quals[0].qual) {
        //  ++num_ties;
        //}
      }
      int best_corrected_barcode_index = 0;
      if (num_ties > 0) {
        std::mt19937 tmp_generator(11);
        std::uniform_int_distribution<int> distribution(0, num_ties); // important: inclusive range
        best_corrected_barcode_index = distribution(tmp_generator);
      }
      //if (num_ties > 0) {
      //  std::mt19937 tmp_generator(11);
      //  std::uniform_int_distribution<int> distribution(0, num_ties); // important: inclusive range
      //  best_corrected_barcode_index = distribution(tmp_generator);
      //}
      //std::cerr << "Corrected the barcode from " << barcode << " to ";
      if (corrected_barcodes_with_quals[best_corrected_barcode_index].score / sum_score >= 0.9) {
        barcode_batch->CorrectBaseAt(barcode_index, corrected_barcodes_with_quals[best_corrected_barcode_index].corrected_base_index, corrected_barcodes_with_quals[best_corrected_barcode_index].correct_base);
        //std::cerr << "score: " << corrected_barcodes_with_quals[best_corrected_barcode_index].score << "\n";
        ++(*num_corrected_barcode);
      }
      //std::cerr << barcode << "\n";
    }
  }
@@ -473,6 +481,37 @@ uint32_t Chromap<MappingRecord>::LoadPairedEndReadsWithBarcodes(SequenceBatch *r
  return num_loaded_pairs;
}

template <typename MappingRecord>
void Chromap<MappingRecord>::ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes) {
  double real_start_time = Chromap<>::GetRealTime();
  SequenceBatch barcode_batch(read_batch_size_);
  for (size_t read_file_index = 0; read_file_index < read_file1_paths_.size(); ++read_file_index) {
    barcode_batch.InitializeLoading(barcode_file_paths_[read_file_index]);
    uint32_t num_loaded_barcodes = barcode_batch.LoadBatch();
    while (num_loaded_barcodes > 0) {
      for (uint32_t barcode_index = 0; barcode_index < num_loaded_barcodes; ++barcode_index) {
        uint32_t barcode_length = barcode_batch.GetSequenceLengthAt(barcode_index);
        uint32_t barcode_key = barcode_batch.GenerateSeedFromSequenceAt(barcode_index, 0, barcode_length);
        khiter_t barcode_whitelist_lookup_table_iterator = kh_get(k32, barcode_whitelist_lookup_table_, barcode_key);
        if (barcode_whitelist_lookup_table_iterator != kh_end(barcode_whitelist_lookup_table_)) {
          // Correct barcode
          kh_value(barcode_whitelist_lookup_table_, barcode_whitelist_lookup_table_iterator) += 1;
          ++num_sample_barcodes_;
        }
      }
      if (num_sample_barcodes_ >= max_num_sample_barcodes) {
        break;
      }
      num_loaded_barcodes = barcode_batch.LoadBatch();
    }
    barcode_batch.FinalizeLoading();
    if (num_sample_barcodes_ >= max_num_sample_barcodes) {
      break;
    }
  }
  std::cerr << "Compute barcode abundance using " << num_sample_barcodes_ << " in "<< Chromap<>::GetRealTime() - real_start_time << "s.\n";
}

template <typename MappingRecord>
void Chromap<MappingRecord>::MapPairedEndReads() {
  double real_start_time = Chromap<>::GetRealTime();
@@ -513,6 +552,7 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
  if (!is_bulk_data_) {
    if (!barcode_whitelist_file_path_.empty()) {
      LoadBarcodeWhitelist();
      ComputeBarcodeAbundance(20000000);
    }
  }
  static uint64_t thread_num_candidates = 0;
@@ -984,11 +1024,11 @@ void Chromap<MappingRecord>::OutputMappingsInVector(uint8_t mapq_threshold, uint
  for (uint32_t ri = 0; ri < num_reference_sequences; ++ri) {
    for (auto it = mappings[ri].begin(); it != mappings[ri].end(); ++it) {
      uint8_t mapq = (it->mapq);
      uint8_t is_unique = (it->is_unique);
      //uint8_t is_unique = (it->is_unique);
      if (mapq >= mapq_threshold) {
        if (allocate_multi_mappings_ || (only_output_unique_mappings_ && is_unique == 1)) {
        //if (allocate_multi_mappings_ || (only_output_unique_mappings_ && is_unique == 1)) {
          output_tools_->AppendMapping(ri, reference, *it);
        }
        //}
      }
    }
  }
@@ -2668,7 +2708,8 @@ void Chromap<MappingRecord>::LoadBarcodeWhitelist() {
    //PoreModelParameters &pore_model_parameters = pore_models_[kmer_hash_value];
    //barcode_whitelist_file_line_string_stream >> pore_model_parameters.level_mean >> pore_model_parameters.level_stdv >> pore_model_parameters.sd_mean >> pore_model_parameters.sd_stdv;
    int khash_return_code;
    kh_put(k32_set, barcode_whitelist_lookup_table_, barcode_key, &khash_return_code);
    khiter_t barcode_whitelist_lookup_table_iterator = kh_put(k32, barcode_whitelist_lookup_table_, barcode_key, &khash_return_code);
    kh_value(barcode_whitelist_lookup_table_, barcode_whitelist_lookup_table_iterator) = 0;
    assert(khash_return_code != -1 && khash_return_code != 0);
    ++num_barcodes;
  }
+9 −4
Original line number Diff line number Diff line
@@ -43,8 +43,10 @@ struct BarcodeWithQual {
  uint32_t corrected_base_index;
  char correct_base;
  char qual;
  double abundance;
  double score;
  bool operator>(const BarcodeWithQual& b) const {
    return std::tie(qual, corrected_base_index, correct_base) > std::tie(b.qual, b.corrected_base_index, b.correct_base);
    return std::tie(score, corrected_base_index, correct_base) > std::tie(b.score, b.corrected_base_index, b.correct_base);
  }
};

@@ -77,7 +79,7 @@ class Chromap {
  // For mapping
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool Tn5_shift, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, bool low_memory_mode, bool cell_by_bin, int bin_size, uint16_t depth_cutoff_to_call_peak, int peak_min_length, int peak_merge_max_length, const std::string &reference_file_path, const std::string &index_file_path, const std::vector<std::string> &read_file1_paths, const std::vector<std::string> &read_file2_paths, const std::vector<std::string> &barcode_file_paths, const std::string &barcode_whitelist_file_path, const std::string &mapping_output_file_path, const std::string &matrix_output_prefix) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), Tn5_shift_(Tn5_shift), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), low_memory_mode_(low_memory_mode), cell_by_bin_(cell_by_bin), bin_size_(bin_size), depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak), peak_min_length_(peak_min_length), peak_merge_max_length_(peak_merge_max_length), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_paths_(read_file1_paths), read_file2_paths_(read_file2_paths), barcode_file_paths_(barcode_file_paths), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path), matrix_output_prefix_(matrix_output_prefix) {
    barcode_lookup_table_ = kh_init(k32);
    barcode_whitelist_lookup_table_ = kh_init(k32_set);
    barcode_whitelist_lookup_table_ = kh_init(k32);
    barcode_histogram_ = kh_init(k32);
    barcode_index_table_ = kh_init(k32);
    NUM_VPU_LANES_ = 0;
@@ -90,7 +92,7 @@ class Chromap {

  ~Chromap(){
    if (barcode_whitelist_lookup_table_ != NULL) {
      kh_destroy(k32_set, barcode_whitelist_lookup_table_);
      kh_destroy(k32, barcode_whitelist_lookup_table_);
    }
    if (barcode_histogram_ != NULL) {
      kh_destroy(k32, barcode_histogram_);
@@ -157,6 +159,7 @@ class Chromap {
  void BuildAugmentedTree(uint32_t ref_id);
  uint32_t GetNumOverlappedMappings(uint32_t ref_id, const MappingRecord &mapping);
  void LoadBarcodeWhitelist();
  void ComputeBarcodeAbundance(uint64_t max_num_sample_barcodes);
  void CorrectBarcodeAt(uint32_t barcode_index, SequenceBatch *barcode_batch, uint64_t *num_barcode_in_whitelist, uint64_t *num_corrected_barcode);
  uint32_t CallPeaks(uint16_t coverage_threshold, uint32_t num_reference_sequences, const SequenceBatch &reference);
  void OutputFeatureMatrix(uint32_t num_sequences, const SequenceBatch &reference);
@@ -226,7 +229,8 @@ class Chromap {
  std::string mapping_output_file_path_;
  FILE *mapping_output_file_;
  std::string matrix_output_prefix_;
  khash_t(k32_set)* barcode_whitelist_lookup_table_;
  //khash_t(k32_set)* barcode_whitelist_lookup_table_;
  khash_t(k32)* barcode_whitelist_lookup_table_;
  // For identical read dedupe
  int allocated_barcode_lookup_table_size_ = (1 << 10);
  khash_t(k32)* barcode_lookup_table_;
@@ -249,6 +253,7 @@ class Chromap {
  uint64_t num_reads_ = 0;
  uint64_t num_duplicated_reads_ = 0; // # identical reads
  // For barcode stats
  uint64_t num_sample_barcodes_ = 0;
  uint64_t num_barcode_in_whitelist_ = 0;
  uint64_t num_corrected_barcode_ = 0;
  khash_t(k32)* barcode_histogram_;