Commit 4237c1c0 authored by Haowen Zhang's avatar Haowen Zhang
Browse files

Clean up the code for generating custom rid rank.

parent 560e286a
Loading
Loading
Loading
Loading
+25 −26
Original line number Diff line number Diff line
@@ -717,46 +717,45 @@ void Chromap::ParseReadFormat(const std::string &read_format) {
    memcpy(barcode_format_, fields, sizeof(fields));
}

// NOTE(review): this span is a rendered diff without +/- markers. It
// interleaves the pre-commit definition (GenerateCustomizedRidRank, using
// rid_order_path / rid_rank / rname_to_rank) with its post-commit replacement
// (GenerateCustomRidRanks, using custom_rid_order_file_path / rid_ranks /
// ref_name_to_rank). Read each pair of adjacent old/new lines as one change.
//
// Both versions fill the output vector so that entry i holds the rank of the
// ith reference sequence:
//   - With an empty order-file path, the ranks are the identity (entry i == i).
//   - Otherwise each line of the order file is taken verbatim as a sequence
//     name, and its 0-based line number becomes that sequence's rank.
//     Sequences not named in the file are ranked afterwards, in their original
//     reference order, starting from the number of distinct names read.
void Chromap::GenerateCustomizedRidRank(const std::string &rid_order_path,
                                        uint32_t num_reference_sequences,
                                        const SequenceBatch &reference,
                                        std::vector<int> &rid_rank) {
  // Pre-commit version: size the output up front so it can be filled by index.
  rid_rank.resize(num_reference_sequences);
void Chromap::GenerateCustomRidRanks(
    const std::string &custom_rid_order_file_path,
    uint32_t num_reference_sequences, const SequenceBatch &reference,
    std::vector<int> &rid_ranks) {
  // Start from the identity ranking: sequence i has rank i.
  for (uint32_t i = 0; i < num_reference_sequences; ++i) {
    rid_rank[i] = i;
    // NOTE(review): the post-commit version appends instead of assigning by
    // index, so the indexed writes below are only in range if rid_ranks is
    // empty on entry -- confirm against callers.
    rid_ranks.emplace_back(i);
  }

  // No custom order requested: keep the identity ranking.
  if (rid_order_path.length() == 0) {
  if (custom_rid_order_file_path.empty()) {
    return;
  }

  // Map each name in the order file to its 0-based line number. A name that
  // appears on several lines keeps the line number of its last occurrence.
  // NOTE(review): there is no check that the stream opened; if it did not, the
  // loop body never runs and the map stays empty.
  std::unordered_map<std::string, int> rname_to_rank;
  std::ifstream file_stream(rid_order_path);
  std::string line;
  uint32_t i = 0;
  while (getline(file_stream, line)) {
    rname_to_rank[line] = i;
    i += 1;
  std::unordered_map<std::string, int> ref_name_to_rank;
  std::ifstream custom_rid_order_file_stream(custom_rid_order_file_path);
  std::string ref_name;
  uint32_t ref_rank = 0;
  while (getline(custom_rid_order_file_stream, ref_name)) {
    ref_name_to_rank[ref_name] = ref_rank;
    ref_rank += 1;
  }
  file_stream.close();
  custom_rid_order_file_stream.close();

  // First put the chromosomes in the list provided by user.
  // First, rank the chromosomes in the custom order provided by users.
  // Sequences that are not listed are marked with -1 and ranked in the next
  // pass.
  for (uint32_t i = 0; i < num_reference_sequences; ++i) {
    std::string rname(reference.GetSequenceNameAt(i));
    if (rname_to_rank.find(rname) != rname_to_rank.end()) {
      rid_rank[i] = rname_to_rank[rname];
    std::string ref_name(reference.GetSequenceNameAt(i));
    if (ref_name_to_rank.find(ref_name) != ref_name_to_rank.end()) {
      rid_ranks[i] = ref_name_to_rank[ref_name];
    } else {
      rid_rank[i] = -1;
      rid_ranks[i] = -1;
    }
  }

  // There might be some rank without any rid associated with. This helps if
  // custom list contains rid not in the reference.
  // NOTE(review): k starts at the number of distinct names read from the file.
  // If the file names sequences absent from the reference, the final ranks
  // have gaps and can reach or exceed num_reference_sequences. If the file
  // repeats a name, k can equal a rank already handed out above. Confirm that
  // consumers of the ranks tolerate this.
  uint32_t k = rname_to_rank.size();
  // Put the remaining chromosomes
  // There might be some rids without any custom order. We just order them based
  // on their original order in the reference file.
  uint32_t k = ref_name_to_rank.size();
  // Rank the remaining chromosomes.
  for (uint32_t i = 0; i < num_reference_sequences; ++i) {
    if (rid_rank[i] == -1) {
      rid_rank[i] = k;
    if (rid_ranks[i] == -1) {
      rid_ranks[i] = k;
      ++k;
    }
  }
+22 −18
Original line number Diff line number Diff line
@@ -109,10 +109,16 @@ class Chromap {

  void ParseReadFormat(const std::string &read_format);

  void GenerateCustomizedRidRank(const std::string &rid_order_path,
  // User custom rid order file contains a column of reference sequence names
  // and there is one name on each row. The reference sequence name on the ith
  // row (counting from 0) means the rank of this sequence is i. This function
  // loads the custom rid order file and generates a mapping from the original
  // rids to their custom ranks, e.g., rid_ranks[i] is the custom rank of the
  // ith rid in the reference.
  //
  // If the file path is empty, rid_ranks is the identity mapping
  // (rid_ranks[i] == i). Reference sequences that are not named in the file
  // are ranked after the listed ones, in their original reference order,
  // starting from the number of distinct names read from the file.
  void GenerateCustomRidRanks(const std::string &custom_rid_order_file_path,
                              uint32_t num_reference_sequences,
                              const SequenceBatch &reference,
                                 std::vector<int> &rid_rank);
                              std::vector<int> &rid_ranks);

  // TODO: generate reranked candidates directly.
  void RerankCandidatesRid(std::vector<Candidate> &candidates);
@@ -167,7 +173,7 @@ void Chromap::MapSingleEndReads() {
  reference.InitializeLoading(mapping_parameters_.reference_file_path);
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  if (mapping_parameters_.custom_rid_order_path.length() > 0) {
    GenerateCustomizedRidRank(mapping_parameters_.custom_rid_order_path,
    GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_path,
                           num_reference_sequences, reference,
                           custom_rid_rank_);
    reference.ReorderSequences(custom_rid_rank_);
@@ -217,9 +223,8 @@ void Chromap::MapSingleEndReads() {
  MappingGenerator<MappingRecord> mapping_generator(mapping_parameters_,
                                                    pairs_custom_rid_rank_);

  MappingWriter<MappingRecord> mapping_writer(mapping_parameters_,
                                              barcode_length_,
                                              pairs_custom_rid_rank_);
  MappingWriter<MappingRecord> mapping_writer(
      mapping_parameters_, barcode_length_, pairs_custom_rid_rank_);

  mapping_writer.OutputHeader(num_reference_sequences, reference);

@@ -537,13 +542,13 @@ void Chromap::MapPairedEndReads() {
  reference.InitializeLoading(mapping_parameters_.reference_file_path);
  uint32_t num_reference_sequences = reference.LoadAllSequences();
  if (mapping_parameters_.custom_rid_order_path.length() > 0) {
    GenerateCustomizedRidRank(mapping_parameters_.custom_rid_order_path,
    GenerateCustomRidRanks(mapping_parameters_.custom_rid_order_path,
                           num_reference_sequences, reference,
                           custom_rid_rank_);
    reference.ReorderSequences(custom_rid_rank_);
  }
  if (mapping_parameters_.mapping_output_format == MAPPINGFORMAT_PAIRS) {
    GenerateCustomizedRidRank(mapping_parameters_.pairs_custom_rid_order_path,
    GenerateCustomRidRanks(mapping_parameters_.pairs_custom_rid_order_path,
                           num_reference_sequences, reference,
                           pairs_custom_rid_rank_);
  }
@@ -608,8 +613,7 @@ void Chromap::MapPairedEndReads() {
                                                    pairs_custom_rid_rank_);

  MappingWriter<MappingRecord> mapping_writer(
      mapping_parameters_, barcode_length_,
      pairs_custom_rid_rank_);
      mapping_parameters_, barcode_length_, pairs_custom_rid_rank_);
  mapping_writer.OutputHeader(num_reference_sequences, reference);

  uint32_t num_mappings_in_mem = 0;