Commit 5d34dcc0 authored by oma219's avatar oma219 Committed by Li Song
Browse files

added options/code for num cache slots computation/turned it on by default/outputs in summary file

parent 63b06d3b
Loading
Loading
Loading
Loading
+94 −14
Original line number Diff line number Diff line
@@ -9,6 +9,9 @@
#include <tuple>
#include <vector>

#include <queue> // Used these two for k-minhash
#include <unordered_set>

#include <sstream> // Used for frip est params splitting

#include "candidate_processor.h"
@@ -35,6 +38,49 @@

namespace chromap {

class K_MinHash {
public:
    /*
     * MinHash Class - used to estimate the number of unique cache slots 
     *                 hit by each barcode
     *
     * @param k - size of MinHash sketch
     * @param range - range of possible cache ids
     */
    K_MinHash(size_t k, size_t range) : k_(k), range_(range) {}

    // K_MinHash(): k_(250), range_(4000003) {}

    void add(size_t num) {
      /* If num is not present in queue, we will add it */
        if (unique_slots_.find(num) == unique_slots_.end()) {
            unique_slots_.insert(num);
            pq_.push(num);
            // only keep smallest k numbers
            if (pq_.size() > k_) {
                unique_slots_.erase(pq_.top());
                pq_.pop();
            }
        }
    }

    size_t compute_cardinality() {
      /* Use k-MinHash estimator to return estimated cardinality */
      size_t k_for_calc = k_;
      if (pq_.size() < k_) {k_for_calc = pq_.size();}
      size_t cardinality = (k_for_calc * range_)/pq_.top() - 1;
      return cardinality;
    }

private:
    size_t k_;
    size_t range_;

    /* Uses an unordered set to have O(1) find queries*/
    std::priority_queue<uint32_t> pq_; // max-heap
    std::unordered_set<uint32_t> unique_slots_; // keep track of unique values
};

class Chromap {
 public:
  Chromap() = delete;
@@ -635,6 +681,19 @@ void Chromap::MapPairedEndReads() {
  
  std::vector<uint64_t> seeds_for_batch(500000, 0);

  // Variables used for counting number of associated cache slots
  bool output_num_cache_slots_info = mapping_parameters_.output_num_uniq_cache_slots;
  const size_t k_for_minhash = mapping_parameters_.k_for_minhash;

  std::cerr << "Output number of associated cache slots: " << output_num_cache_slots_info << std::endl;
  std::cerr << "K for MinHash: " << k_for_minhash << std::endl;

  int num_locks_for_map = 1000;
  omp_lock_t map_locks[num_locks_for_map];
  for (int i = 0; i < num_locks_for_map; ++i) {omp_init_lock(&map_locks[i]);}
  
  std::vector<std::unordered_map<size_t, K_MinHash>> barcode_peak_map(num_locks_for_map);

  // Parse out the parameters for chromap score (const, fric, dup, unmapped, lowmapq)
  std::vector<double> frip_est_params; 
  std::stringstream ss(mapping_parameters_.frip_est_params);
@@ -887,24 +946,24 @@ void Chromap::MapPairedEndReads() {
                size_t current_num_candidates2 = paired_end_mapping_metadata.mapping_metadata2_.GetNumCandidates();

                // increment variable for cache_hits
                //bool curr_read_hit_cache = false;
                bool curr_read_hit_cache = false;
                if (cache_query_result1 >= 0 || cache_query_result2 >= 0) {
                  cache_hits_for_batch++;
                  //curr_read_hit_cache = true;
                  curr_read_hit_cache = true;
                }

                // update the peak counting data-structure
                // if (output_peak_info && curr_read_hit_cache) {
                //   // calculate which map this barcode is in
                //   size_t map_id = curr_seed_val % num_locks_for_map;
                
                //   // grab lock for this map, and add to the K-MinHash for this particular barcode
                //   omp_set_lock(&map_locks[map_id]);
                //   auto it = barcode_peak_map[map_id].emplace(curr_seed_val, K_MinHash(k_for_minhash, mapping_parameters_.cache_size)).first;
                //   if (cache_query_result1 >= 0) {it->second.add(cache_query_result1);}
                //   if (cache_query_result2 >= 0) {it->second.add(cache_query_result2);}
                //   omp_unset_lock(&map_locks[map_id]);
                // }
                if (output_num_cache_slots_info && curr_read_hit_cache) {
                  // calculate which map this barcode is in
                  size_t map_id = curr_seed_val % num_locks_for_map;
                
                  // grab lock for this map, and add to the K-MinHash for this particular barcode
                  omp_set_lock(&map_locks[map_id]);
                  auto it = barcode_peak_map[map_id].emplace(curr_seed_val, K_MinHash(k_for_minhash, mapping_parameters_.cache_size)).first;
                  if (cache_query_result1 >= 0) {it->second.add(cache_query_result1);}
                  if (cache_query_result2 >= 0) {it->second.add(cache_query_result2);}
                  omp_unset_lock(&map_locks[map_id]);
                }

                if (pair_index < history_update_threshold) {
                  mm_history1[pair_index].timestamp =
@@ -1284,10 +1343,31 @@ void Chromap::MapPairedEndReads() {
    //  }
    //}
  }
  
  if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_SAM)
    mapping_writer.AdjustSummaryPairedEndOverCount() ;

  mapping_writer.OutputSummaryMetadata(frip_est_params);
  // Destory the locks used for map
  for (int i = 0; i < num_locks_for_map; ++i) {
    omp_destroy_lock(&map_locks[i]);
  }

  // Add cardinality information to summary metadata
  if (output_num_cache_slots_info) {
    for (auto curr_map: barcode_peak_map) {
      for (auto &pair: curr_map) {
        size_t curr_seed = pair.first;
        size_t est_num_slots = pair.second.compute_cardinality();

        mapping_writer.UpdateSummaryMetadata( 
                          curr_seed,
                          SUMMARY_METADATA_CARDINALITY, 
                          est_num_slots);
      }
    }
  }

  mapping_writer.OutputSummaryMetadata(frip_est_params, output_num_cache_slots_info);
  reference.FinalizeLoading();
  
  std::cerr << "Total time: " << GetRealTime() - real_start_time << "s.\n";
+14 −4
Original line number Diff line number Diff line
@@ -79,10 +79,12 @@ void AddMappingOptions(cxxopts::Options &options) {
                        cxxopts::value<int>(), "INT")
      ("cache-size", "number of cache entries [4000003]", cxxopts::value<int>(), "INT")
      ("cache-update-param", "value used to control number of reads sampled [0.01]", cxxopts::value<double>(), "FLT")
      ("use-all-reads", "use all reads for cache")
      ("use-all-reads", "use all reads for cache (it will slow down execution)")
      ("debug-cache", "verbose output for debugging cache used in chromap")
      ("frip-est-params", "coefficients used for chromap score calculation separated by semi-colons",
      cxxopts::value<std::string>(), "STR");
      ("frip-est-params", "coefficients used for frip est calculation, separated by semi-colons",
      cxxopts::value<std::string>(), "STR")
      ("turn-off-num-uniq-cache-slots", "turn off the output of number of cache slots in summary file")
      ("k-for-minhash", "number of values stored in each MinHash sketch [250]", cxxopts::value<int>(), "INT");
}

void AddInputOptions(cxxopts::Options &options) {
@@ -354,7 +356,15 @@ void ChromapDriver::ParseArgsAndRun(int argc, char *argv[]) {
  if (result.count("frip-est-params")) {
    mapping_parameters.frip_est_params = result["frip-est-params"].as<std::string>();
  }

  if (result.count("turn-off-num-uniq-cache-slots")) {
    mapping_parameters.output_num_uniq_cache_slots = false;
  } 
  if (result.count("k-for-minhash")) {
    mapping_parameters.k_for_minhash = result["k-for-minhash"].as<int>();
    if (mapping_parameters.k_for_minhash < 1 || mapping_parameters.k_for_minhash >= 2000) {
      chromap::ExitWithMessage("Invalid paramter for size of MinHash sketch (--k-for-minhash)");
    }
  }


  if (result.count("min-read-length")) {
+2 −0
Original line number Diff line number Diff line
@@ -29,6 +29,8 @@ struct MappingParameters {
  bool use_all_reads = false;
  bool debug_cache = false;
  std::string frip_est_params = "0.2809;0.8921;7.490e-06;-1.893e-05;-2.178e-05";
  bool output_num_uniq_cache_slots = true;
  int k_for_minhash = 250;

  // Read with # best mappings greater than it will have this number of best
  // mappings reported.
+4 −3
Original line number Diff line number Diff line
@@ -68,7 +68,7 @@ class MappingWriter {
      std::vector<TempMappingFileHandle<MappingRecord>>
          &temp_mapping_file_handles);

  void OutputSummaryMetadata(std::vector<double> frip_est_coeffs = {0.0, 0.0, 0.0, 0.0, 0.0});
  void OutputSummaryMetadata(std::vector<double> frip_est_coeffs = {0.0, 0.0, 0.0, 0.0, 0.0}, bool output_num_cache_slots_info = true);
  void UpdateSummaryMetadata(uint64_t barcode, int type, int change);
  void UpdateSpeicalCategorySummaryMetadata(int category, int type, int change);
  void AdjustSummaryPairedEndOverCount();
@@ -444,13 +444,14 @@ void MappingWriter<MappingRecord>::OutputMappings(
}

template <typename MappingRecord>
void MappingWriter<MappingRecord>::OutputSummaryMetadata(std::vector<double> frip_est_coeffs) {
void MappingWriter<MappingRecord>::OutputSummaryMetadata(std::vector<double> frip_est_coeffs, bool output_num_cache_slots_info) {
  if (!mapping_parameters_.summary_metadata_file_path.empty())
  {
    summary_metadata_.Output(
        mapping_parameters_.summary_metadata_file_path.c_str(),
        !mapping_parameters_.barcode_whitelist_file_path.empty() && !mapping_parameters_.output_mappings_not_in_whitelist,
        frip_est_coeffs
        frip_est_coeffs,
        output_num_cache_slots_info
        );
  }
}
+39 −15
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ enum SummaryMetadataField {
  SUMMARY_METADATA_MAPPED,
  SUMMARY_METADATA_LOWMAPQ,
	SUMMARY_METADATA_CACHEHIT,
  SUMMARY_METADATA_CARDINALITY,
  SUMMARY_METADATA_FIELDS
};

@@ -42,7 +43,7 @@ class SummaryMetadata {
    kh_destroy(k64_barcode_metadata, barcode_metadata_);
  }

  inline void OutputCounts(const char *barcode, const int *counts, FILE *fp, std::vector<double> frip_est_coeffs)
  inline void OutputCounts(const char *barcode, const int *counts, FILE *fp, std::vector<double> frip_est_coeffs, bool output_num_cache_slots_info)
  {
    // define variables to store values
    size_t num_total = counts[SUMMARY_METADATA_TOTAL];
@@ -55,6 +56,8 @@ class SummaryMetadata {
    size_t num_cachehit = counts[SUMMARY_METADATA_CACHEHIT];
    double fric = (double) num_cachehit / (double) num_mapped;

    size_t num_cache_slots = counts[SUMMARY_METADATA_CARDINALITY];

    // compute the estimated frip
    double est_frip = frip_est_coeffs[0] + /* constant */
                      (frip_est_coeffs[1] * fric) +
@@ -62,7 +65,8 @@ class SummaryMetadata {
                      (frip_est_coeffs[3] * num_unmapped)  +
                      (frip_est_coeffs[4] * num_lowmapq);

    // print barcode as string
    // print out data for current barcode
    if (!output_num_cache_slots_info) {
      fprintf(fp, "%s,%ld,%ld,%ld,%ld,%ld,%.5lf,%.5lf\n", 
              barcode,
              num_total,
@@ -72,11 +76,29 @@ class SummaryMetadata {
              num_cachehit,
              fric,
              est_frip);
    } else {
      fprintf(fp, "%s,%ld,%ld,%ld,%ld,%ld,%.5lf,%.5lf,%ld\n", 
              barcode,
              num_total,
              num_dup,
              num_unmapped,
              num_lowmapq,
              num_cachehit,
              fric,
              est_frip,
              num_cache_slots);
    }
  }

  void Output(const char *filename, bool has_white_list, std::vector<double> frip_est_coeffs) {
  void Output(const char *filename, bool has_white_list, std::vector<double> frip_est_coeffs, bool output_num_cache_slots_info) {
    FILE *fp = fopen(filename, "w");

    // Change summary file header depending on options
    if (!output_num_cache_slots_info)
      fprintf(fp, "barcode,total,duplicate,unmapped,lowmapq,cachehit,fric,estfrip\n");   
    else
      fprintf(fp, "barcode,total,duplicate,unmapped,lowmapq,cachehit,fric,estfrip,numcacheslots\n"); 

    khiter_t k;
    for (k = kh_begin(barcode_metadata_); k != kh_end(barcode_metadata_); ++k)
      if (kh_exist(barcode_metadata_, k)) {
@@ -84,7 +106,8 @@ class SummaryMetadata {
                    Seed2Sequence(kh_key(barcode_metadata_, k), barcode_length_).c_str(),
                    kh_value(barcode_metadata_, k).counts, 
                    fp,
                    frip_est_coeffs
                    frip_est_coeffs,
                    output_num_cache_slots_info
                    );
      }
    if (has_white_list) {
@@ -92,7 +115,8 @@ class SummaryMetadata {
                   "non-whitelist", 
                   nonwhitelist_summary_.counts, 
                   fp,
                   frip_est_coeffs
                   frip_est_coeffs,
                   output_num_cache_slots_info
                   ) ;
    }
    fclose(fp);