Commit d26c0d4c authored by Li's avatar Li Committed by Haowen Zhang
Browse files

Code refactoring

parent c1f625c9
Loading
Loading
Loading
Loading
+111 −0
Original line number Diff line number Diff line
#include <cinttypes>
#include <cstring>
#include <functional>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "khash.h"

namespace chromap {

// Instantiates the khash table type `k64_str`: uint64_t keys mapped to
// char * values (the `1` argument selects map mode, i.e. values are stored),
// using khash's built-in 64-bit integer hash and equality functions.
KHASH_INIT(k64_str, uint64_t, char *, 1, kh_int64_hash_func, kh_int64_hash_equal);

// The class for handling barcode conversion.
//
// Maps barcode seeds (2 bits per base, packed into a uint64_t) to replacement
// strings loaded from a translation file. Each line of the file has the form
// "<to><sep><from>", where <sep> is a comma or a tab. <from> is converted to a
// seed with SequenceBatch::GenerateSeedFromSequence and used as the hash key;
// <to> is the string emitted by Translate(). When no table has been set,
// Translate() simply decodes the seed back to its base sequence.
//
// NOTE: the class owns a raw khash table and does not define copy operations,
// so copying an instance after SetTranslateTable() would free the table twice.
class BarcodeTranslator
{
private:
  khash_t(k64_str) *barcode_translate_table;  // NULL until SetTranslateTable().
  int from_bc_length;  // Length (in bases) of one "from" barcode; -1 if unset.
  uint64_t mask;       // Low 2 * from_bc_length bits set.

  // Decodes `seed_length` bases from `seed`, most significant base first.
  std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
    std::string sequence;
    sequence.reserve(seed_length);
    const uint64_t base_mask = 3;  // Two bits per base.
    for (uint32_t i = 0; i < seed_length; ++i) {
      sequence.push_back(SequenceBatch::Uint8ToChar(
          (seed >> ((seed_length - 1 - i) * 2)) & base_mask));
    }
    return sequence;
  }

  // Frees every stored string and the table itself; safe to call repeatedly.
  void DestroyTranslateTable() {
    if (barcode_translate_table == NULL) {
      return;
    }
    for (khiter_t k = kh_begin(barcode_translate_table);
         k != kh_end(barcode_translate_table); ++k) {
      if (kh_exist(barcode_translate_table, k)) {
        free(kh_value(barcode_translate_table, k));
      }
    }
    kh_destroy(k64_str, barcode_translate_table);
    barcode_translate_table = NULL;
  }

  // Parses one "<to><sep><from>" line and inserts it into the table.
  // Blank lines are ignored; lines without a separator or without a <from>
  // field are reported on stderr and skipped.
  void ProcessTranslateFileLine(std::string &line) {
    // Strip a trailing '\r' so CRLF files do not lengthen the from-barcode.
    if (!line.empty() && line[line.length() - 1] == '\r') {
      line.erase(line.length() - 1);
    }
    const int len = static_cast<int>(line.length());
    if (len == 0) {
      return;
    }

    int i;
    for (i = 0; i < len; ++i) {
      if (line[i] == ',' || line[i] == '\t')
        break;
    }
    // i == len: no separator found; i == len - 1: nothing after the separator.
    if (i >= len - 1) {
      std::cerr << "Skipping malformed line in barcode translation file: "
                << line << "\n";
      return;
    }

    const std::string to = line.substr(0, i);
    from_bc_length = len - i - 1;
    const uint64_t from_seed = SequenceBatch::GenerateSeedFromSequence(
        line.c_str(), len, i + 1, from_bc_length);

    int khash_return_code;
    khiter_t barcode_translate_table_iter = kh_put(
        k64_str, barcode_translate_table, from_seed, &khash_return_code);
    if (khash_return_code < 0) {
      std::cerr << "Failed to insert into the barcode translation table.\n";
      exit(-1);
    }
    if (khash_return_code == 0) {
      // Key already present: release the old string before overwriting it.
      free(kh_value(barcode_translate_table, barcode_translate_table_iter));
    }
    kh_value(barcode_translate_table, barcode_translate_table_iter) =
        strdup(to.c_str());
  }

public:
  BarcodeTranslator()
      : barcode_translate_table(NULL), from_bc_length(-1), mask(0) {}

  ~BarcodeTranslator() { DestroyTranslateTable(); }

  // Loads the translation table from `file`, replacing any table loaded
  // earlier. Terminates the program if the file cannot be opened or contains
  // no usable entry, since translation would otherwise silently misbehave.
  void SetTranslateTable(const std::string &file) {
    DestroyTranslateTable();
    from_bc_length = -1;
    mask = 0;

    std::ifstream file_stream(file);
    if (!file_stream.is_open()) {
      std::cerr << "Cannot open barcode translation file: " << file << "\n";
      exit(-1);
    }

    barcode_translate_table = kh_init(k64_str);
    std::string file_line;
    while (getline(file_stream, file_line)) {
      ProcessTranslateFileLine(file_line);
    }

    if (from_bc_length <= 0) {
      std::cerr << "No valid entry found in barcode translation file: " << file
                << "\n";
      exit(-1);
    }
    // Shifting a 64-bit value by 64 is undefined, so handle 32 bases apart.
    mask = from_bc_length >= 32 ? ~0ull
                                : (1ull << (2 * from_bc_length)) - 1;
  }

  // Translates barcode `bc` of `bc_length` bases. Without a table the decoded
  // base sequence is returned. With a table, `bc` is split into
  // bc_length / from_bc_length segments (most significant first), each segment
  // is looked up, and the results are joined with '-'. Terminates the program
  // if a segment is missing from the table.
  std::string Translate(uint64_t bc, uint32_t bc_length) {
    if (barcode_translate_table == NULL) {
      return Seed2Sequence(bc, bc_length);
    }

    std::string ret;
    const uint32_t segment_length = static_cast<uint32_t>(from_bc_length);
    const uint32_t num_segments = bc_length / segment_length;
    for (uint32_t i = 0; i < num_segments; ++i) {
      // Bring segment i down to the low bits, then keep only its 2-bit bases.
      const uint64_t seed =
          (bc >> (2 * (num_segments - 1 - i) * segment_length)) & mask;
      khiter_t barcode_translate_table_iter =
          kh_get(k64_str, barcode_translate_table, seed);
      if (barcode_translate_table_iter == kh_end(barcode_translate_table)) {
        std::cerr << "Barcode does not exist in the translation table.\n" << std::endl;
        exit(-1);
      }
      if (i > 0) {
        ret += "-";
      }
      ret += kh_value(barcode_translate_table, barcode_translate_table_iter);
    }
    return ret;
  }
};
}
+0 −2
Original line number Diff line number Diff line
@@ -2998,8 +2998,6 @@ void Chromap<MappingRecord>::MapSingleEndReads() {
  }

  index.Destroy();
  //OutputMappingStatistics(num_reference_sequences, mappings_on_diff_ref_seqs_,
  //                        mappings_on_diff_ref_seqs_);

  if (Tn5_shift_) {
    ApplyTn5ShiftOnSingleEndMapping(num_reference_sequences,
+3 −3
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ void OutputTools<MappingWithBarcode>::AppendMapping(
        std::string(reference_sequence_name) + "\t" +
        std::to_string(mapping.GetStartPosition()) + "\t" +
        std::to_string(mapping_end_position) + "\t" +
        barcode_translator.Translate(mapping.cell_barcode_, cell_barcode_length_) + 
        barcode_translator_.Translate(mapping.cell_barcode_, cell_barcode_length_) + 
        "\t" + std::to_string(mapping.num_dups_) + "\n");
  } else {
    std::string strand = mapping.IsPositiveStrand() ? "+" : "-";
@@ -129,7 +129,7 @@ void OutputTools<PairedEndMappingWithBarcode>::AppendMapping(
        std::string(reference_sequence_name) + "\t" +
        std::to_string(mapping.GetStartPosition()) + "\t" +
        std::to_string(mapping_end_position) + "\t" +
        barcode_translator.Translate(mapping.cell_barcode_, cell_barcode_length_) + "\t" +
        barcode_translator_.Translate(mapping.cell_barcode_, cell_barcode_length_) + "\t" +
        std::to_string(mapping.num_dups_) + "\n");
  } else {
    bool positive_strand = mapping.IsPositiveStrand();
@@ -339,7 +339,7 @@ void OutputTools<SAMMapping>::AppendMapping(uint32_t rid,
      "\tMD:Z:" + mapping.MD_);
  if (cell_barcode_length_ > 0) {
    this->AppendMappingOutput(
        "\tCB:Z:" + barcode_translator.Translate(mapping.cell_barcode_, cell_barcode_length_));
        "\tCB:Z:" + barcode_translator_.Translate(mapping.cell_barcode_, cell_barcode_length_));
  }
  this->AppendMappingOutput("\n");
}
+4 −102
Original line number Diff line number Diff line
@@ -18,11 +18,10 @@
#include "sam_mapping.h"
#include "sequence_batch.h"
#include "khash.h"
#include "barcode_translator.h"

namespace chromap {

// Instantiates the khash table type `k64_str`: uint64_t keys mapped to
// char * values (the `1` argument selects map mode, i.e. values are stored),
// using khash's built-in 64-bit integer hash and equality functions.
KHASH_INIT(k64_str, uint64_t, char *, 1, kh_int64_hash_func, kh_int64_hash_equal);

enum MappingOutputFormat {
  MAPPINGFORMAT_UNKNOWN,
  MAPPINGFORMAT_BED,
@@ -38,103 +37,6 @@ bool ReadIdLess(const std::pair<uint32_t, MappingRecord> &a,
  return a.second.read_id_ < b.second.read_id_;
}

// The class for handling barcode conversion.
//
// Maps barcode seeds (2 bits per base, packed into a uint64_t) to replacement
// strings loaded from a translation file. Each line of the file has the form
// "<to><sep><from>", where <sep> is a comma or a tab. <from> is converted to a
// seed with SequenceBatch::GenerateSeedFromSequence and used as the hash key;
// <to> is the string emitted by Translate(). When no table has been set,
// Translate() simply decodes the seed back to its base sequence.
//
// NOTE: the class owns a raw khash table and does not define copy operations,
// so copying an instance after SetTranslateTable() would free the table twice.
class BarcodeTranslator
{
private:
  khash_t(k64_str) *barcode_translate_table;  // NULL until SetTranslateTable().
  int from_bc_length;  // Length (in bases) of one "from" barcode; -1 if unset.
  uint64_t mask;       // Low 2 * from_bc_length bits set.

  // Decodes `seed_length` bases from `seed`, most significant base first.
  std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
    std::string sequence;
    sequence.reserve(seed_length);
    const uint64_t base_mask = 3;  // Two bits per base.
    for (uint32_t i = 0; i < seed_length; ++i) {
      sequence.push_back(SequenceBatch::Uint8ToChar(
          (seed >> ((seed_length - 1 - i) * 2)) & base_mask));
    }
    return sequence;
  }

  // Frees every stored string and the table itself; safe to call repeatedly.
  void DestroyTranslateTable() {
    if (barcode_translate_table == NULL) {
      return;
    }
    for (khiter_t k = kh_begin(barcode_translate_table);
         k != kh_end(barcode_translate_table); ++k) {
      if (kh_exist(barcode_translate_table, k)) {
        free(kh_value(barcode_translate_table, k));
      }
    }
    kh_destroy(k64_str, barcode_translate_table);
    barcode_translate_table = NULL;
  }

  // Parses one "<to><sep><from>" line and inserts it into the table.
  // Blank lines are ignored; lines without a separator or without a <from>
  // field are reported on stderr and skipped.
  void ProcessTranslateFileLine(std::string &line) {
    // Strip a trailing '\r' so CRLF files do not lengthen the from-barcode.
    if (!line.empty() && line[line.length() - 1] == '\r') {
      line.erase(line.length() - 1);
    }
    const int len = static_cast<int>(line.length());
    if (len == 0) {
      return;
    }

    int i;
    for (i = 0; i < len; ++i) {
      if (line[i] == ',' || line[i] == '\t')
        break;
    }
    // i == len: no separator found; i == len - 1: nothing after the separator.
    if (i >= len - 1) {
      std::cerr << "Skipping malformed line in barcode translation file: "
                << line << "\n";
      return;
    }

    const std::string to = line.substr(0, i);
    from_bc_length = len - i - 1;
    const uint64_t from_seed = SequenceBatch::GenerateSeedFromSequence(
        line.c_str(), len, i + 1, from_bc_length);

    int khash_return_code;
    khiter_t barcode_translate_table_iter = kh_put(
        k64_str, barcode_translate_table, from_seed, &khash_return_code);
    if (khash_return_code < 0) {
      std::cerr << "Failed to insert into the barcode translation table.\n";
      exit(-1);
    }
    if (khash_return_code == 0) {
      // Key already present: release the old string before overwriting it.
      free(kh_value(barcode_translate_table, barcode_translate_table_iter));
    }
    kh_value(barcode_translate_table, barcode_translate_table_iter) =
        strdup(to.c_str());
  }

public:
  BarcodeTranslator()
      : barcode_translate_table(NULL), from_bc_length(-1), mask(0) {}

  ~BarcodeTranslator() { DestroyTranslateTable(); }

  // Loads the translation table from `file`, replacing any table loaded
  // earlier. Terminates the program if the file cannot be opened or contains
  // no usable entry, since translation would otherwise silently misbehave.
  void SetTranslateTable(const std::string &file) {
    DestroyTranslateTable();
    from_bc_length = -1;
    mask = 0;

    std::ifstream file_stream(file);
    if (!file_stream.is_open()) {
      std::cerr << "Cannot open barcode translation file: " << file << "\n";
      exit(-1);
    }

    barcode_translate_table = kh_init(k64_str);
    std::string file_line;
    while (getline(file_stream, file_line)) {
      ProcessTranslateFileLine(file_line);
    }

    if (from_bc_length <= 0) {
      std::cerr << "No valid entry found in barcode translation file: " << file
                << "\n";
      exit(-1);
    }
    // Shifting a 64-bit value by 64 is undefined, so handle 32 bases apart.
    mask = from_bc_length >= 32 ? ~0ull
                                : (1ull << (2 * from_bc_length)) - 1;
  }

  // Translates barcode `bc` of `bc_length` bases. Without a table the decoded
  // base sequence is returned. With a table, `bc` is split into
  // bc_length / from_bc_length segments (most significant first), each segment
  // is looked up, and the results are joined with '-'. Terminates the program
  // if a segment is missing from the table.
  std::string Translate(uint64_t bc, uint32_t bc_length) {
    if (barcode_translate_table == NULL) {
      return Seed2Sequence(bc, bc_length);
    }

    std::string ret;
    const uint32_t segment_length = static_cast<uint32_t>(from_bc_length);
    const uint32_t num_segments = bc_length / segment_length;
    for (uint32_t i = 0; i < num_segments; ++i) {
      // Bring segment i down to the low bits, then keep only its 2-bit bases.
      const uint64_t seed =
          (bc >> (2 * (num_segments - 1 - i) * segment_length)) & mask;
      khiter_t barcode_translate_table_iter =
          kh_get(k64_str, barcode_translate_table, seed);
      if (barcode_translate_table_iter == kh_end(barcode_translate_table)) {
        std::cerr << "Barcode does not exist in the translation table.\n" << std::endl;
        exit(-1);
      }
      if (i > 0) {
        ret += "-";
      }
      ret += kh_value(barcode_translate_table, barcode_translate_table_iter);
    }
    return ret;
  }
};

template <typename MappingRecord>
class OutputTools {
 public:
@@ -236,7 +138,7 @@ class OutputTools {

  void AppendBarcodeOutput(uint64_t barcode_key) {
    fprintf(barcode_output_file_, "%s-1\n",
            barcode_translator.Translate(barcode_key, cell_barcode_length_).data());
            barcode_translator_.Translate(barcode_key, cell_barcode_length_).data());
            //Seed2Sequence(barcode_key, cell_barcode_length_).data());
  }

@@ -263,7 +165,7 @@ class OutputTools {
  }

  inline void SetBarcodeTranslateTable(std::string &file) {
    barcode_translator.SetTranslateTable(file);
    barcode_translator_.SetTranslateTable(file);
  }

  std::vector<int> custom_rid_rank_;  // for pairs
@@ -279,7 +181,7 @@ class OutputTools {
  FILE *peak_output_file_;
  FILE *barcode_output_file_;
  FILE *matrix_output_file_;
  BarcodeTranslator barcode_translator;
  BarcodeTranslator barcode_translator_;
};

// Specialization for BED format.