Commit 8c762bf2 authored by Li's avatar Li
Browse files

Implement dup and lowmapq count summary output

parent da08831a
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -528,6 +528,7 @@ void Chromap::MapSingleEndReads() {
    mapping_writer.OutputMappings(num_reference_sequences, reference,
                                  mappings_on_diff_ref_seqs);
  }
  mapping_writer.OutputSummaryMetadata();

  reference.FinalizeLoading();
  std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n";
@@ -1073,7 +1074,7 @@ void Chromap::MapPairedEndReads() {
    }
    mapping_writer.OutputMappings(num_reference_sequences, reference,
                                  mappings_on_diff_ref_seqs);

    mapping_writer.OutputSummaryMetadata();
    // Temporarily disable feature matrix output. Do not delete the following
    // commented code.
    // if (!is_bulk_data_ && !matrix_output_prefix_.empty()) {
+8 −1
Original line number Diff line number Diff line
@@ -120,6 +120,8 @@ void ChromapDriver::ParseArgsAndRun(int argc, char *argv[]) {
          cxxopts::value<std::string>(),
          "FILE")("barcode-translate",
                  "Convert barcode to the specified sequences during output",
                  cxxopts::value<std::string>(), "FILE")
            ("summary", "Summarize the mapping metadata at bulk or barcode level",
             cxxopts::value<std::string>(), "FILE");
  //("PAF", "Output mappings in PAF format (only for test)");
  options.add_options()("v,version", "Print version")("h,help", "Print help");
@@ -429,6 +431,11 @@ void ChromapDriver::ParseArgsAndRun(int argc, char *argv[]) {
          result["barcode-translate"].as<std::string>();
    }
    
    if (result.count("summary")) {
      mapping_parameters.summary_metadata_file_path = 
        result["summary"].as<std::string>();
    }

    if (result.count("skip-barcode-check")) {
      mapping_parameters.skip_barcode_check = true;
    }
+1 −0
Original line number Diff line number Diff line
@@ -66,6 +66,7 @@ struct MappingParameters {
  // The order for pairs format flipping.
  std::string pairs_flipping_custom_rid_order_file_path;
  std::string barcode_translate_table_file_path;
  std::string summary_metadata_file_path;
  bool skip_barcode_check = false;

  int GetNumVPULanes() const {
+26 −1
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include "sequence_batch.h"
#include "temp_mapping.h"
#include "utils.h"
#include "summary_metadata.h"

namespace chromap {

@@ -39,7 +40,7 @@ class MappingWriter {
      barcode_translator_.SetTranslateTable(
          mapping_parameters_.barcode_translate_table_file_path);
    }

    summary_metadata_.SetBarcodeLength(cell_barcode_length);
    mapping_output_file_ =
        fopen(mapping_parameters_.mapping_output_file_path.c_str(), "w");
    assert(mapping_output_file_ != nullptr);
@@ -67,6 +68,8 @@ class MappingWriter {
      std::vector<TempMappingFileHandle<MappingRecord>>
          &temp_mapping_file_handles);

  void OutputSummaryMetadata();

 protected:
  void AppendMapping(uint32_t rid, const SequenceBatch &reference,
                     const MappingRecord &mapping);
@@ -110,6 +113,8 @@ class MappingWriter {
  const uint32_t cell_barcode_length_;
  FILE *mapping_output_file_ = nullptr;
  BarcodeTranslator barcode_translator_;
  SummaryMetadata summary_metadata_;

  // for pairs
  const std::vector<int> pairs_custom_rid_rank_;
};
@@ -277,6 +282,13 @@ void MappingWriter<MappingRecord>::ProcessAndOutputMappingsInLowMemory(
          }
          AppendMapping(last_rid, reference, last_mapping);
          ++num_mappings_passing_filters;
          if (!mapping_parameters_.summary_metadata_file_path.empty())
            summary_metadata_.UpdateCount(last_mapping.GetBarcode(), SUMMARY_METADATA_DUP,
                last_mapping.num_dups_ - 1);
        } else {
          if (!mapping_parameters_.summary_metadata_file_path.empty())
            summary_metadata_.UpdateCount(last_mapping.GetBarcode(), SUMMARY_METADATA_LOWMAPQ, 
                last_mapping.num_dups_);
        }

        if (last_mapping.is_unique_ == 1) {
@@ -379,6 +391,13 @@ void MappingWriter<MappingRecord>::OutputMappingsInVector(
        AppendMapping(ri, reference, *it);
        ++num_mappings_passing_filters;
        //}
        if (!mapping_parameters_.summary_metadata_file_path.empty())
          summary_metadata_.UpdateCount(it->GetBarcode(), SUMMARY_METADATA_DUP,
              it->num_dups_ - 1);
      } else {
        if (!mapping_parameters_.summary_metadata_file_path.empty())
          summary_metadata_.UpdateCount(it->GetBarcode(), SUMMARY_METADATA_LOWMAPQ,
              it->num_dups_);
      }
    }
  }
@@ -396,6 +415,12 @@ void MappingWriter<MappingRecord>::OutputMappings(
                         num_reference_sequences, reference, mappings);
}

template <typename MappingRecord>
void MappingWriter<MappingRecord>::OutputSummaryMetadata() {
  if (!mapping_parameters_.summary_metadata_file_path.empty())
    summary_metadata_.Output(mapping_parameters_.summary_metadata_file_path.c_str());
}

// Specialization for BED format.
template <>
void MappingWriter<MappingWithBarcode>::OutputHeader(
+16 −6
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@

namespace chromap {

enum {
enum SummaryMetadataField {
  SUMMARY_METADATA_TOTAL = 0,
  SUMMARY_METADATA_DUP,
  SUMMARY_METADATA_UNMAPPED,
@@ -24,7 +24,7 @@ enum {
struct _barcodeSummaryMetadata {
  int counts[SUMMARY_METADATA_FIELDS];
  _barcodeSummaryMetadata() {
    memset(counts, 0, sizeof(counts));
    memset(counts, 0, sizeof(int) * SUMMARY_METADATA_FIELDS);
  }
};

@@ -35,18 +35,19 @@ class SummaryMetadata {
 public:
  SummaryMetadata() {
    barcode_metadata_ = kh_init(k64_barcode_metadata);
    barcode_length_ = 16;
  }
  ~SummaryMetadata() {
    kh_destroy(k64_barcode_metadata, barcode_metadata_);
  }

  void Output(char *filename, int barcode_length) {
  void Output(const char *filename) {
    FILE *fp = fopen(filename, "w") ;
    fprintf(fp, "barcode,total,duplicate,unmapped,lowmapq");   
    fprintf(fp, "barcode,total,duplicate,unmapped,lowmapq\n");   
    khiter_t k;
    for (k = kh_begin(barcode_metadata_); k != kh_end(barcode_metadata_); ++k)
      if (kh_exist(barcode_metadata_, k)) {
        fprintf(fp, "%s", Seed2Sequence(kh_key(barcode_metadata_, k), barcode_length).c_str());
        fprintf(fp, "%s", Seed2Sequence(kh_key(barcode_metadata_, k), barcode_length_).c_str());
        int i;
        for (i = 0; i < SUMMARY_METADATA_FIELDS; ++i) {
          fprintf(fp, ",%d", kh_value(barcode_metadata_, k).counts[i]);
@@ -59,11 +60,20 @@ class SummaryMetadata {
  void UpdateCount(uint64_t barcode, int type, int change) {
    int khash_return_code;
    khiter_t barcode_metadata_iter = kh_put(k64_barcode_metadata, barcode_metadata_, barcode, &khash_return_code);
    if (khash_return_code) {
      struct _barcodeSummaryMetadata nb;
      kh_value(barcode_metadata_, barcode_metadata_iter) = nb;
    }
    kh_value(barcode_metadata_, barcode_metadata_iter).counts[type] += change;
  }

  void SetBarcodeLength(int l) {
    barcode_length_ = l;
  }

 private:
  khash_t(k64_barcode_metadata) *barcode_metadata_;    
  int barcode_length_;

  std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
    std::string sequence;