Commit ac1d8ce1 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Clean up the code of MappingWriter.

I added a function to sample the first 1000 barcodes and check
whether their lengths are equal to each other's and those in the
whitelist. A check on whether barcode length exceeds 32bp was also
added.
parent 312caba1
Loading
Loading
Loading
Loading
+54 −34
Original line number Diff line number Diff line
@@ -369,6 +369,31 @@ bool Chromap<MappingRecord>::CorrectBarcodeAt(
  }
}

template <typename MappingRecord>
uint32_t Chromap<MappingRecord>::SampleInputBarcodesAndExamineLength() {
  if (is_bulk_data_) {
    return 0;
  }

  uint32_t sample_batch_size = 1000;
  SequenceBatch barcode_batch(sample_batch_size);

  barcode_batch.InitializeLoading(barcode_file_paths_[0]);

  uint32_t num_loaded_barcodes = barcode_batch.LoadBatch();

  uint32_t cell_barcode_length = barcode_batch.GetSequenceLengthAt(0);
  for (uint32_t i = 1; i < num_loaded_barcodes; ++i) {
    if (barcode_batch.GetSequenceLengthAt(i) != cell_barcode_length) {
      ExitWithMessage("ERROR: Barcode lengths are not equal in the sample!");
    }
  }

  barcode_batch.FinalizeLoading();

  return cell_barcode_length;
}

template <typename MappingRecord>
uint32_t Chromap<MappingRecord>::LoadPairedEndReadsWithBarcodes(
    SequenceBatch &read_batch1, SequenceBatch &read_batch2,
@@ -848,25 +873,13 @@ void Chromap<MappingRecord>::MapPairedEndReads() {

  // Preprocess barcodes for single cell data
  if (!is_bulk_data_) {
    barcode_length_ = SampleInputBarcodesAndExamineLength();
    if (!barcode_whitelist_file_path_.empty()) {
      LoadBarcodeWhitelist();
      ComputeBarcodeAbundance(initial_num_sample_barcodes_);
    }
  }

  MappingWriter<MappingRecord> mapping_writer;
  if (!barcode_translate_table_path_.empty()) {
    mapping_writer.SetBarcodeTranslateTable(barcode_translate_table_path_);
  }
  // Initialize output tools
  if (mapping_output_format_ == MAPPINGFORMAT_PAIRS) {
    mapping_writer.SetPairsCustomRidRank(pairs_custom_rid_rank_);
  }

  mapping_writer.InitializeMappingOutput(
      barcode_length_, mapping_output_file_path_, mapping_output_format_);
  mapping_writer.OutputHeader(num_reference_sequences, reference);

  CandidateProcessor candidate_processor(min_num_seeds_required_for_mapping_,
                                         max_seed_frequencies_);

@@ -880,6 +893,14 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
      max_insert_size_, min_read_length_, drop_repetitive_reads_, is_bulk_data_,
      split_alignment_, mapping_output_format_, pairs_custom_rid_rank_);

  MappingWriter<MappingRecord> mapping_writer(
      mapping_output_file_path_, mapping_output_format_, barcode_length_,
      barcode_translate_table_path_.empty() ? nullptr
                                            : &barcode_translate_table_path_,
      mapping_output_format_ == MAPPINGFORMAT_PAIRS ? &pairs_custom_rid_rank_
                                                    : nullptr);
  mapping_writer.OutputHeader(num_reference_sequences, reference);

  uint32_t num_mappings_in_mem = 0;
  uint64_t max_num_mappings_in_mem =
      1 * ((uint64_t)1 << 30) / sizeof(MappingRecord);
@@ -921,10 +942,6 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
    read_batch2_for_loading.SwapSequenceBatch(read_batch2);
    if (!is_bulk_data_) {
      barcode_batch_for_loading.SwapSequenceBatch(barcode_batch);
      // TODO(Haowen): simplify this condition check.
      if (num_loaded_pairs > 0) {
        mapping_writer.SetBarcodeLength(barcode_batch.GetSequenceLengthAt(0));
      }
    }

    // Setup thread private vectors to save mapping results.
@@ -1355,8 +1372,6 @@ void Chromap<MappingRecord>::MapPairedEndReads() {
    //}
  }

  mapping_writer.FinalizeMappingOutput();

  reference.FinalizeLoading();

  std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n";
@@ -1464,6 +1479,7 @@ void Chromap<MappingRecord>::MapSingleEndReads() {

  // Preprocess barcodes for single cell data
  if (!is_bulk_data_) {
    barcode_length_ = SampleInputBarcodesAndExamineLength();
    if (!barcode_whitelist_file_path_.empty()) {
      LoadBarcodeWhitelist();
      ComputeBarcodeAbundance(initial_num_sample_barcodes_);
@@ -1483,12 +1499,12 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
      max_insert_size_, min_read_length_, drop_repetitive_reads_, is_bulk_data_,
      split_alignment_, mapping_output_format_, pairs_custom_rid_rank_);

  MappingWriter<MappingRecord> mapping_writer;
  if (!barcode_translate_table_path_.empty()) {
    mapping_writer.SetBarcodeTranslateTable(barcode_translate_table_path_);
  }
  mapping_writer.InitializeMappingOutput(
      barcode_length_, mapping_output_file_path_, mapping_output_format_);
  MappingWriter<MappingRecord> mapping_writer(
      mapping_output_file_path_, mapping_output_format_, barcode_length_,
      barcode_translate_table_path_.empty() ? nullptr
                                            : &barcode_translate_table_path_,
      /*custom_rid_rank=*/nullptr);

  mapping_writer.OutputHeader(num_reference_sequences, reference);

  uint32_t num_mappings_in_mem = 0;
@@ -1531,10 +1547,6 @@ void Chromap<MappingRecord>::MapSingleEndReads() {

    if (!is_bulk_data_) {
      barcode_batch_for_loading.SwapSequenceBatch(barcode_batch);
      // TODO(Haowen): simplify this condition check.
      if (num_loaded_reads > 0) {
        mapping_writer.SetBarcodeLength(barcode_batch.GetSequenceLengthAt(0));
      }
    }

    std::vector<std::vector<std::vector<MappingRecord>>>
@@ -1783,7 +1795,6 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
                   mappings_on_diff_ref_seqs, mapping_writer);
  }

  mapping_writer.FinalizeMappingOutput();
  reference.FinalizeLoading();
  std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n";
}
@@ -1938,11 +1949,20 @@ void Chromap<MappingRecord>::LoadBarcodeWhitelist() {
    std::string barcode;
    barcode_whitelist_file_line_string_stream >> barcode;
    size_t barcode_length = barcode.length();
    if (barcode_length > 32) {
      ExitWithMessage("ERROR: barcode length is greater than 32!");
    }

    if (barcode_length != barcode_length_) {
      if (num_barcodes == 0) {
      barcode_length_ = barcode_length;
        ExitWithMessage(
            "ERROR: whitelist and input barcode lengths are not equal!");
      } else {
      assert(barcode_length == barcode_length_);
        ExitWithMessage(
            "ERROR: barcode lengths are not equal in the whitelist!");
      }
    }

    // if (first_line) {
    //  //size_t barcode_length = kmer.length();
    //  // Allocate memory to save pore model parameters
+2 −0
Original line number Diff line number Diff line
@@ -144,6 +144,8 @@ class Chromap {
  void MapPairedEndReads();

 private:
  uint32_t SampleInputBarcodesAndExamineLength();

  uint32_t LoadSingleEndReadsWithBarcodes(SequenceBatch *read_batch,
                                          SequenceBatch *barcode_batch);

+1 −1
Original line number Diff line number Diff line
@@ -378,7 +378,7 @@ void MappingWriter<PairsMapping>::OutputHeader(uint32_t num_reference_sequences,
  rid_order.resize(num_reference_sequences);
  uint32_t i;
  for (i = 0; i < num_reference_sequences; ++i) {
    rid_order[this->custom_rid_rank_[i]] = i;
    rid_order[custom_rid_rank_[i]] = i;
  }
  this->AppendMappingOutput("## pairs format v1.0.0\n#shape: upper triangle\n");
  for (i = 0; i < num_reference_sequences; ++i) {
+30 −35
Original line number Diff line number Diff line
@@ -24,8 +24,29 @@ namespace chromap {
template <typename MappingRecord>
class MappingWriter {
 public:
  MappingWriter() {}
  ~MappingWriter() {}
  MappingWriter() = delete;

  MappingWriter(const std::string &mapping_output_file_path,
                MappingOutputFormat &mapping_output_format,
                uint32_t cell_barcode_length,
                const std::string *barcode_translate_table_file_path,
                const std::vector<int> *custom_rid_rank)
      : mapping_output_file_path_(mapping_output_file_path),
        mapping_output_format_(mapping_output_format),
        cell_barcode_length_(cell_barcode_length) {
    if (barcode_translate_table_file_path != nullptr) {
      barcode_translator_.SetTranslateTable(*barcode_translate_table_file_path);
    }

    if (custom_rid_rank != nullptr) {
      custom_rid_rank_ = *custom_rid_rank;
    }

    mapping_output_file_ = fopen(mapping_output_file_path_.c_str(), "w");
    assert(mapping_output_file_ != nullptr);
  }

  ~MappingWriter() { fclose(mapping_output_file_); }

  // Output the mappings in a temp file.
  inline void OutputTempMapping(
@@ -47,33 +68,16 @@ class MappingWriter {
    fclose(temp_mapping_output_file);
  }

  inline void InitializeMappingOutput(
      uint32_t cell_barcode_length, const std::string &mapping_output_file_path,
      MappingOutputFormat &format) {
    cell_barcode_length_ = cell_barcode_length;
    mapping_output_file_path_ = mapping_output_file_path;
    mapping_output_file_ = fopen(mapping_output_file_path_.c_str(), "w");
    assert(mapping_output_file_ != NULL);
    mapping_output_format_ = format;
  }

  inline void SetBarcodeLength(uint32_t cell_barcode_length) {
    cell_barcode_length_ = cell_barcode_length;
  }

  inline void FinalizeMappingOutput() { fclose(mapping_output_file_); }

  inline void AppendMappingOutput(const std::string &line) {
    fprintf(mapping_output_file_, "%s", line.data());
  }

  void OutputHeader(uint32_t num_reference_sequences,
                    const SequenceBatch &reference);

  void AppendMapping(uint32_t rid, const SequenceBatch &reference,
                     const MappingRecord &mapping);

  inline uint32_t GetNumMappings() const { return num_mappings_; }
 protected:
  inline void AppendMappingOutput(const std::string &line) {
    fprintf(mapping_output_file_, "%s", line.data());
  }

  inline std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
    std::string sequence;
@@ -86,24 +90,15 @@ class MappingWriter {
    return sequence;
  }

  inline void SetPairsCustomRidRank(const std::vector<int> &custom_rid_rank) {
    custom_rid_rank_ = custom_rid_rank;
  }

  inline void SetBarcodeTranslateTable(const std::string &file) {
    barcode_translator_.SetTranslateTable(file);
  }

  std::vector<int> custom_rid_rank_;  // for pairs
 protected:
  std::string mapping_output_file_path_;
  FILE *mapping_output_file_;
  // TODO(Haowen): use this variable to decide output in BED or TagAlign. It
  // should be removed later.
  MappingOutputFormat mapping_output_format_ = MAPPINGFORMAT_BED;
  uint32_t num_mappings_;
  uint32_t cell_barcode_length_ = 16;
  FILE *mapping_output_file_ = nullptr;
  BarcodeTranslator barcode_translator_;
  // for pairs
  std::vector<int> custom_rid_rank_;
};

// Specialization for BED format.