Commit 851d1a25 authored by Li's avatar Li Committed by Haowen Zhang
Browse files

Preliminary implementation of barcode translation.

parent 847cdd7c
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -22,7 +22,7 @@ dir:
$(exec): $(objs)
	$(CXX) $(CXXFLAGS) $(objs) -o $(exec) $(LDFLAGS)
	
$(objs_dir)/%.o: $(src_dir)/%.cc
$(objs_dir)/%.o: $(src_dir)/%.cc $(src_dir)/%.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

.PHONY: clean
+8 −1
Original line number Diff line number Diff line
@@ -5912,6 +5912,9 @@ void ChromapDriver::ParseArgsAndRun(int argc, char *argv[]) {
          "Output mappings in pairs format (defined by 4DN for HiC data)")(
          "pairs-natural-chr-order",
          "natural chromosome order for pairs flipping",
          cxxopts::value<std::string>(), "FILE")
          ("barcode-translate", 
           "Convert barcode to the specified sequences during output",
           cxxopts::value<std::string>(), "FILE");
  //("PAF", "Output mappings in PAF format (only for test)");
  options.add_options()("v,version", "Print version")("h,help", "Print help");
@@ -6216,6 +6219,10 @@ void ChromapDriver::ParseArgsAndRun(int argc, char *argv[]) {
          result["pairs-natural-chr-order"].as<std::string>();
    }

    if (result.count("barcode-translate")) {
      mapping_parameters.barcode_translate_table_path = result["barcode-translate"].as<std::string>();
    }

    if (result.count("skip-barcode-check")) {
      mapping_parameters.skip_barcode_check = true;
    }
+5 −0
Original line number Diff line number Diff line
@@ -119,6 +119,7 @@ struct MappingParameters {
  std::string matrix_output_prefix;
  std::string custom_rid_order_path;
  std::string pairs_custom_rid_order_path;
  std::string barcode_translate_table_path;
  bool skip_barcode_check = false;
};

@@ -220,6 +221,10 @@ class Chromap {
    }

    ParseReadFormat(mapping_parameters.read_format);
    if (mapping_parameters.barcode_translate_table_path.length() > 0) {
      std::string tmp = mapping_parameters.barcode_translate_table_path;
      output_tools_.SetBarcodeTranslateTable(tmp);
    }
  }

  ~Chromap() {
+3 −3
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ void OutputTools<MappingWithBarcode>::AppendMapping(
        std::string(reference_sequence_name) + "\t" +
        std::to_string(mapping.GetStartPosition()) + "\t" +
        std::to_string(mapping_end_position) + "\t" +
        Seed2Sequence(mapping.cell_barcode_, cell_barcode_length_) + "\n");
        barcode_translator.Translate(mapping.cell_barcode_, cell_barcode_length_) + "\n");
  } else {
    std::string strand = mapping.IsPositiveStrand() ? "+" : "-";
    const char *reference_sequence_name = reference.GetSequenceNameAt(rid);
@@ -128,7 +128,7 @@ void OutputTools<PairedEndMappingWithBarcode>::AppendMapping(
        std::string(reference_sequence_name) + "\t" +
        std::to_string(mapping.GetStartPosition()) + "\t" +
        std::to_string(mapping_end_position) + "\t" +
        Seed2Sequence(mapping.cell_barcode_, cell_barcode_length_) + "\t" +
        barcode_translator.Translate(mapping.cell_barcode_, cell_barcode_length_) + "\t" +
        std::to_string(mapping.num_dups_) + "\n");
  } else {
    bool positive_strand = mapping.IsPositiveStrand();
@@ -338,7 +338,7 @@ void OutputTools<SAMMapping>::AppendMapping(uint32_t rid,
      "\tMD:Z:" + mapping.MD_);
  if (cell_barcode_length_ > 0) {
    this->AppendMappingOutput(
        "\tCB:Z:" + Seed2Sequence(mapping.cell_barcode_, cell_barcode_length_));
        "\tCB:Z:" + barcode_translator.Translate(mapping.cell_barcode_, cell_barcode_length_));
  }
  this->AppendMappingOutput("\n");
}
+95 −1
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#include <cinttypes>
#include <cstring>
#include <functional>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
@@ -16,9 +17,12 @@
#include "pairs_mapping.h"
#include "sam_mapping.h"
#include "sequence_batch.h"
#include "khash.h"

namespace chromap {

KHASH_INIT(k64_str, uint64_t, std::string, 1, kh_int64_hash_func, kh_int64_hash_equal);

enum MappingOutputFormat {
  MAPPINGFORMAT_UNKNOWN,
  MAPPINGFORMAT_BED,
@@ -34,6 +38,90 @@ bool ReadIdLess(const std::pair<uint32_t, MappingRecord> &a,
  return a.second.read_id_ < b.second.read_id_;
}

// The class for handling barcode convertion.
class BarcodeTranslator
{
private:
  khash_t(k64_str) *barcode_translate_table;
  int from_bc_length;
  
  std::string Seed2Sequence(uint64_t seed, uint32_t seed_length) const {
    std::string sequence;
    sequence.reserve(seed_length);
    uint64_t mask = 3;
    for (uint32_t i = 0; i < seed_length; ++i) {
      sequence.push_back(SequenceBatch::Uint8ToChar(
          (seed >> ((seed_length - 1 - i) * 2)) & mask));
    }
    return sequence;
  }
  
  void ProcessTranslateFileLine(std::string &line) {
    int i;
    int len = line.length();
    std::string to;
    for (i = 0; i < len; ++i) {
      if (line[i] == ',' || line[i] == '\t')
        break;
    }

    to = line.substr(0, i);
    //from = line.substr(i + 1, len - i - 1);
    from_bc_length = len - i - 1;
    uint64_t from_seed = SequenceBatch::GenerateSeedFromSequence(line.c_str(), len, i + 1, from_bc_length);
    
    int khash_return_code;
    khiter_t barcode_translate_table_iter = kh_put(k64_str, barcode_translate_table, from_seed, &khash_return_code);
    kh_value(barcode_translate_table, barcode_translate_table_iter) = to;
  }

public:
  BarcodeTranslator() {
    barcode_translate_table = NULL;
    from_bc_length = -1;
  }
  
  ~BarcodeTranslator() {
    if (barcode_translate_table != NULL) {
      kh_destroy(k64_str, barcode_translate_table);
    }
  }

  void SetTranslateTable(const std::string &file) {
    barcode_translate_table = kh_init(k64_str);
    std::ifstream file_stream(file);
    std::string file_line;

    while (getline(file_stream, file_line)) {
      ProcessTranslateFileLine(file_line); 
    }
  }
  
  std::string Translate(uint64_t bc, uint32_t bc_length) {
    if (barcode_translate_table == NULL) {
      return Seed2Sequence(bc, bc_length);
    } 

    std::string ret;  
    uint64_t i;
    for (i = 0; i < bc_length / from_bc_length; ++i) {
      uint64_t seed = (bc << (i * from_bc_length)) >> (bc_length - from_bc_length);
      khiter_t barcode_translate_table_iter = kh_get(k64_str, barcode_translate_table, seed);
      if (barcode_translate_table_iter == kh_end(barcode_translate_table)) {
        std::cerr << "Barcode does not exist in the translation table.\n" << std::endl;
        exit(-1);
      }
      const std::string &bc_to = kh_value(barcode_translate_table, barcode_translate_table_iter);
      if (i == 0) {
        ret = bc_to;
      } else {
        ret += "-" + bc_to;
      }
    }
    return ret;
  }
};

template <typename MappingRecord>
class OutputTools {
 public:
@@ -135,7 +223,8 @@ class OutputTools {

  void AppendBarcodeOutput(uint64_t barcode_key) {
    fprintf(barcode_output_file_, "%s-1\n",
            Seed2Sequence(barcode_key, cell_barcode_length_).data());
            barcode_translator.Translate(barcode_key, cell_barcode_length_).data());
            //Seed2Sequence(barcode_key, cell_barcode_length_).data());
  }

  void WriteMatrixOutputHead(uint64_t num_peaks, uint64_t num_barcodes,
@@ -160,6 +249,10 @@ class OutputTools {
    custom_rid_rank_ = custom_rid_rank;
  }

  inline void SetBarcodeTranslateTable(std::string &file) {
    barcode_translator.SetTranslateTable(file);
  }

  std::vector<int> custom_rid_rank_;  // for pairs
 protected:
  std::string mapping_output_file_path_;
@@ -173,6 +266,7 @@ class OutputTools {
  FILE *peak_output_file_;
  FILE *barcode_output_file_;
  FILE *matrix_output_file_;
  BarcodeTranslator barcode_translator;
};

// Specialization for BED format.