Unverified Commit 1daabe5c authored by Haowen Zhang's avatar Haowen Zhang Committed by GitHub
Browse files

Merge pull request #25 from haowenz/custom_read_format

Custom read format
parents 957a92b4 b956460c
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -115,6 +115,13 @@ threshold to make a correction by setting **--bc-probability-threshold**
corrections. For scATAC-seq data with multiple read and barcode files, you can
use "," to concatenate multiple input files as the example [above](#general). 

Chromap also supports user-defined barcode format, including mixed barcode and genomic 
data case. User can specify the sequence structure through option **--read-format**. The value
is comma-separated string, each field is also semi-comma-splitted string: [r1|r2|bc]:start:end.
The start and end(inclusive, -1 means to the read end). For the example that the barcode is in read1's 
first 16bp, one can use the option 
`-1 read1.fq.gz -2 read2.fq.gz --barcode read1.fq.gz --read-format bc:0:15,r1:16:-1`

The BED format (fragment file) for bulk and single-cell is different except for the first
three columns. For bulk data, the columns are

+67 −50

File changed.

Preview size limit exceeded, changes collapsed.

+84 −17
Original line number Diff line number Diff line
@@ -45,7 +45,7 @@ struct Peak {
};

KHASH_MAP_INIT_INT64(k128, uint128_t);
KHASH_MAP_INIT_INT(k32, uint32_t);
KHASH_MAP_INIT_INT64(k64_seq, uint64_t);
KHASH_SET_INIT_INT(k32_set);
KHASH_MAP_INIT_INT64(kmatrix, uint32_t);

@@ -82,31 +82,95 @@ class Chromap {
  }

  // For mapping
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int barcode_correction_error_threshold, double barcode_correction_probability_threshold, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool remove_pcr_duplicates_at_bulk_level, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool output_mappings_not_in_whitelist, bool Tn5_shift, bool split_alignment, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, bool output_mapping_in_SAM, bool output_mapping_in_pairs, bool low_memory_mode, bool cell_by_bin, int bin_size, uint16_t depth_cutoff_to_call_peak, int peak_min_length, int peak_merge_max_length, const std::string &reference_file_path, const std::string &index_file_path, const std::vector<std::string> &read_file1_paths, const std::vector<std::string> &read_file2_paths, const std::vector<std::string> &barcode_file_paths, const std::string &barcode_whitelist_file_path, const std::string &mapping_output_file_path, const std::string &matrix_output_prefix, const std::string &custom_rid_order_path, const std::string &pairs_custom_rid_order_path) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), barcode_correction_error_threshold_(barcode_correction_error_threshold), barcode_correction_probability_threshold_(barcode_correction_probability_threshold), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), remove_pcr_duplicates_at_bulk_level_(remove_pcr_duplicates_at_bulk_level), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), output_mappings_not_in_whitelist_(output_mappings_not_in_whitelist), Tn5_shift_(Tn5_shift), split_alignment_(split_alignment), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), output_mapping_in_SAM_(output_mapping_in_SAM), output_mapping_in_pairs_(output_mapping_in_pairs), low_memory_mode_(low_memory_mode), cell_by_bin_(cell_by_bin), bin_size_(bin_size), depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak), peak_min_length_(peak_min_length), peak_merge_max_length_(peak_merge_max_length), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_paths_(read_file1_paths), read_file2_paths_(read_file2_paths), barcode_file_paths_(barcode_file_paths), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path), matrix_output_prefix_(matrix_output_prefix), custom_rid_order_path_(custom_rid_order_path), pairs_custom_rid_order_path_(pairs_custom_rid_order_path) {
    barcode_lookup_table_ = kh_init(k32);
    barcode_whitelist_lookup_table_ = kh_init(k32);
    barcode_histogram_ = kh_init(k32);
    barcode_index_table_ = kh_init(k32);
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int barcode_correction_error_threshold, double barcode_correction_probability_threshold, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool remove_pcr_duplicates_at_bulk_level, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool output_mappings_not_in_whitelist, bool Tn5_shift, bool split_alignment, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, bool output_mapping_in_SAM, bool output_mapping_in_pairs, bool low_memory_mode, bool cell_by_bin, int bin_size, uint16_t depth_cutoff_to_call_peak, int peak_min_length, int peak_merge_max_length, const std::string &reference_file_path, const std::string &index_file_path, const std::vector<std::string> &read_file1_paths, const std::vector<std::string> &read_file2_paths, const std::vector<std::string> &barcode_file_paths, const std::string &barcode_whitelist_file_path, const std::string &read_format, const std::string &mapping_output_file_path, const std::string &matrix_output_prefix, const std::string &custom_rid_order_path, const std::string &pairs_custom_rid_order_path) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), barcode_correction_error_threshold_(barcode_correction_error_threshold), barcode_correction_probability_threshold_(barcode_correction_probability_threshold), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), remove_pcr_duplicates_at_bulk_level_(remove_pcr_duplicates_at_bulk_level), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), output_mappings_not_in_whitelist_(output_mappings_not_in_whitelist), Tn5_shift_(Tn5_shift), split_alignment_(split_alignment), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), output_mapping_in_SAM_(output_mapping_in_SAM), output_mapping_in_pairs_(output_mapping_in_pairs), low_memory_mode_(low_memory_mode), cell_by_bin_(cell_by_bin), bin_size_(bin_size), depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak), peak_min_length_(peak_min_length), peak_merge_max_length_(peak_merge_max_length), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_paths_(read_file1_paths), read_file2_paths_(read_file2_paths), barcode_file_paths_(barcode_file_paths), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path), matrix_output_prefix_(matrix_output_prefix), custom_rid_order_path_(custom_rid_order_path), pairs_custom_rid_order_path_(pairs_custom_rid_order_path) {
    barcode_lookup_table_ = kh_init(k64_seq);
    barcode_whitelist_lookup_table_ = kh_init(k64_seq);
    barcode_histogram_ = kh_init(k64_seq);
    barcode_index_table_ = kh_init(k64_seq);
    NUM_VPU_LANES_ = 0;
    if (error_threshold_ < 8) {
      NUM_VPU_LANES_ = 8;
    } else if (error_threshold_ < 16) {
      NUM_VPU_LANES_ = 4;
    }

    // Parse read format
    uint32_t i ; 
    int j = 0;
    int k = 0; // for read1, read2, or barcode
    read1_format_[0] = 0; read1_format_[1] = -1 ;
    read2_format_[0] = 0; read2_format_[1] = -1 ;
    barcode_format_[0] = 0; barcode_format_[1] = -1 ;
    int fields[2] = {0, -1} ;
		char buffer[20] ;
		int blen = 0;
    for (i = 0; i < read_format.size(); ++i) {
      if (read_format[i] == ',' || i == 0) {
				if (i > 0) {
					buffer[blen] = '\0';
					fields[j] = atoi(buffer);

					if (k == 0)
						memcpy(read1_format_, fields, sizeof(fields));
					else if (k == 1)
						memcpy(read2_format_, fields, sizeof(fields));
					else
						memcpy(barcode_format_, fields, sizeof(fields));

					++i;
				}
        if (read_format[i] == 'r' && read_format[i + 1] == '1') 
          k = 0;
        else if (read_format[i] == 'r' && read_format[i + 1] == '2') 
          k = 1;
        else if (read_format[i] == 'b' && read_format[i + 1] == 'c')
          k = 2;
        else
          ExitWithMessage("Unknown read format: " + read_format + "\n");
        j = 0;
				fields[0] = fields[1] = 0;
				i += 2;
				blen = 0;
      } else {
        if (read_format[i] != ':') {
          if (j < 2) {
						buffer[blen] = read_format[i] ;
						++blen;
          }
					else {
						ExitWithMessage("Unknown read format: " + read_format + "\n");
          }
        } else {
					buffer[blen] = '\0';
					fields[j] = atoi(buffer);
          ++j ;
					blen = 0;
        }
      }
    }
		buffer[blen] = '\0';
		fields[j] = atoi(buffer);
    // By initialization, it is fine even if there is no read_format specified  
    if (k == 0)
      memcpy(read1_format_, fields, sizeof(fields));
    else if (k == 1)
      memcpy(read2_format_, fields, sizeof(fields));
    else
      memcpy(barcode_format_, fields, sizeof(fields));
  }

  ~Chromap(){
    if (barcode_whitelist_lookup_table_ != NULL) {
      kh_destroy(k32, barcode_whitelist_lookup_table_);
      kh_destroy(k64_seq, barcode_whitelist_lookup_table_);
    }
    if (barcode_histogram_ != NULL) {
      kh_destroy(k32, barcode_histogram_);
      kh_destroy(k64_seq, barcode_histogram_);
    }
    if (barcode_index_table_ != NULL) {
      kh_destroy(k32, barcode_index_table_);
      kh_destroy(k64_seq, barcode_index_table_);
    }
    if (barcode_lookup_table_ != NULL) {
      kh_destroy(k32, barcode_lookup_table_);
      kh_destroy(k64_seq, barcode_lookup_table_);
    }
    if (read_lookup_tables_.size() > 0) {
      for (uint32_t i = 0; i < read_lookup_tables_.size(); ++i) {
@@ -129,9 +193,9 @@ class Chromap {
  void RecalibrateBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, uint32_t pair_index, int min_sum_errors, int second_min_sum_errors, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const std::vector<std::pair<int, uint64_t> > &mappings2, const std::vector<std::pair<uint32_t, uint32_t> > &edit_best_mappings, std::vector<std::pair<uint32_t, uint32_t> > *best_mappings, int *best_alignment_score, int *num_best_mappings, int *second_best_alignment_score, int *num_second_best_mappings);
  void ProcessBestMappingsForPairedEndReadOnOneDirection(Direction first_read_direction, Direction second_read_direction, uint32_t pair_index, uint8_t mapq, int num_candidates1, uint32_t repetitive_seed_length1, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &mappings1, const std::vector<int> &split_sites1, int num_candidates2, uint32_t repetitive_seed_length2, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings2, const std::vector<int> &split_sites2, const std::vector<std::pair<uint32_t, uint32_t> > &best_mappings, int min_sum_errors, int num_best_mappings, int second_min_sum_errors, int num_second_best_mappings, int *best_mapping_index, int *num_best_mappings_reported, int force_mapq, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void GenerateBestMappingsForPairedEndRead(uint32_t pair_index, int num_positive_candidates1, int num_negative_candidates1, uint32_t repetitive_seed_length1, int min_num_errors1, int num_best_mappings1, int second_min_num_errors1, int num_second_best_mappings1, const SequenceBatch &read_batch1, const std::vector<std::pair<int, uint64_t> > &positive_mappings1, const std::vector<int> &positive_split_sites1, const std::vector<std::pair<int, uint64_t> > &negative_mappings1, const std::vector<int> &negative_split_sites1, int num_positive_candidates2, int num_negative_candidates2, uint32_t repetitive_seed_length2, int min_num_errors2, int num_best_mappings2, int second_min_num_errors2, int num_second_best_mappings2, const SequenceBatch &read_batch2, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<std::pair<int, uint64_t> > &positive_mappings2, const std::vector<int> &positive_split_sites2, const std::vector<std::pair<int, uint64_t> > &negative_mappings2, const std::vector<int> &negative_split_sites2, std::vector<int> *best_mapping_indices, std::mt19937 *generator, std::vector<std::pair<uint32_t, uint32_t> > *F1R2_best_mappings, std::vector<std::pair<uint32_t, uint32_t> > *F2R1_best_mappings, std::vector<std::pair<uint32_t, uint32_t> > *F1F2_best_mappings, std::vector<std::pair<uint32_t, uint32_t> > *R1R2_best_mappings, int *min_sum_errors, int *num_best_mappings, int *second_min_sum_errors, int *num_second_best_mappings, int force_mapq, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, uint16_t positive_alignment_length, uint16_t negative_alignment_length, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read1_name, const char *read2_name, uint16_t read1_length, uint16_t read2_length, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq1, uint8_t mapq2, uint8_t direction, uint8_t is_unique, uint8_t num_dups, uint16_t positive_alignment_length, uint16_t negative_alignment_length, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint32_t cell_barcode, int rid1, int rid2, uint32_t pos1, uint32_t pos2, int direction1, int direction2, uint8_t mapq, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, uint64_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, uint16_t positive_alignment_length, uint16_t negative_alignment_length, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read1_name, const char *read2_name, uint16_t read1_length, uint16_t read2_length, uint64_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq1, uint8_t mapq2, uint8_t direction, uint8_t is_unique, uint8_t num_dups, uint16_t positive_alignment_length, uint16_t negative_alignment_length, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint64_t cell_barcode, int rid1, int rid2, uint32_t pos1, uint32_t pos2, int direction1, int direction2, uint8_t mapq, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void ApplyTn5ShiftOnPairedEndMapping(uint32_t num_reference_sequences, std::vector<std::vector<MappingRecord> > *mappings);

  // For single-end read mapping
@@ -253,6 +317,9 @@ class Chromap {
  std::vector<std::string> read_file2_paths_;
  std::vector<std::string> barcode_file_paths_;
  std::string barcode_whitelist_file_path_;
  int barcode_format_[2] ; // 0-start,1-end(includsive)
  int read1_format_[2] ; 
  int read2_format_[2] ;
  std::string mapping_output_file_path_;
  FILE *mapping_output_file_;
  std::string matrix_output_prefix_;
@@ -261,10 +328,10 @@ class Chromap {
	std::vector<int> custom_rid_rank_;
	std::vector<int> pairs_custom_rid_rank_;
  //khash_t(k32_set)* barcode_whitelist_lookup_table_;
  khash_t(k32)* barcode_whitelist_lookup_table_;
  khash_t(k64_seq)* barcode_whitelist_lookup_table_;
  // For identical read dedupe
  int allocated_barcode_lookup_table_size_ = (1 << 10);
  khash_t(k32)* barcode_lookup_table_;
  khash_t(k64_seq)* barcode_lookup_table_;
  std::vector<khash_t(k128)* > read_lookup_tables_;
  // For mapping
  int min_unique_mapping_mapq_ = 4;
@@ -289,8 +356,8 @@ class Chromap {
  uint64_t num_barcode_in_whitelist_ = 0;
  uint64_t num_corrected_barcode_ = 0;
  uint32_t barcode_length_ = 0;
  khash_t(k32)* barcode_histogram_;
  khash_t(k32)* barcode_index_table_;
  khash_t(k64_seq)* barcode_histogram_;
  khash_t(k64_seq)* barcode_index_table_;
  // For peak calling
  std::vector<std::vector<uint16_t> > pileup_on_diff_ref_seqs_;
  std::vector<std::vector<Peak> > peaks_on_diff_ref_seqs_;
+12 −12
Original line number Diff line number Diff line
@@ -298,7 +298,7 @@ struct PairedPAFMapping {
struct SAMMapping {
  uint32_t read_id;
  std::string read_name;
  uint32_t cell_barcode;
  uint64_t cell_barcode;
  //uint16_t read_length;
  //uint32_t fragment_start_position;
  //uint16_t fragment_length;
@@ -329,7 +329,7 @@ struct SAMMapping {
  bool HasSamePosition(const SAMMapping& m) const {
    return std::tie(pos, rid, is_rev) == std::tie(m.pos, m.rid, m.is_rev);
  }
  uint32_t GetBarcode() const {
  uint64_t GetBarcode() const {
    return cell_barcode;
  }
  void Tn5Shift() {
@@ -396,7 +396,7 @@ struct SAMMapping {
    uint16_t read_name_length = read_name.length();
    num_written_bytes += fwrite(&read_name_length, sizeof(uint16_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(read_name.data(), sizeof(char), read_name_length, temp_mapping_output_file);
    num_written_bytes += fwrite(&cell_barcode, sizeof(uint32_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&cell_barcode, sizeof(uint64_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&num_dups, sizeof(uint8_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&pos, sizeof(int64_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&rid, sizeof(int), 1, temp_mapping_output_file);
@@ -426,7 +426,7 @@ struct SAMMapping {
    num_read_bytes += fread(&read_name_length, sizeof(uint16_t), 1, temp_mapping_output_file);
    read_name = std::string(read_name_length, '\0');
    num_read_bytes += fread(&(read_name[0]), sizeof(char), read_name_length, temp_mapping_output_file);
    num_read_bytes += fread(&cell_barcode, sizeof(uint32_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&cell_barcode, sizeof(uint64_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&num_dups, sizeof(uint8_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&pos, sizeof(int64_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&rid, sizeof(int), 1, temp_mapping_output_file);
@@ -465,7 +465,7 @@ struct SAMMapping {
struct PairsMapping {  
  uint32_t read_id;
  std::string read_name;
  uint32_t cell_barcode;
  uint64_t cell_barcode;
  int rid1;
  int rid2;
  uint32_t pos1; 
@@ -524,7 +524,7 @@ struct PairsMapping {
    uint16_t read_name_length = read_name.length();
    num_written_bytes += fwrite(&read_name_length, sizeof(uint16_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(read_name.data(), sizeof(char), read_name_length, temp_mapping_output_file);
    num_written_bytes += fwrite(&cell_barcode, sizeof(uint32_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&cell_barcode, sizeof(uint64_t), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&rid1, sizeof(int), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&rid2, sizeof(int), 1, temp_mapping_output_file);
    num_written_bytes += fwrite(&pos1, sizeof(uint32_t), 1, temp_mapping_output_file);
@@ -542,7 +542,7 @@ struct PairsMapping {
    num_read_bytes += fread(&read_name_length, sizeof(uint16_t), 1, temp_mapping_output_file);
    read_name = std::string(read_name_length, '\0');
    num_read_bytes += fread(&(read_name[0]), sizeof(char), read_name_length, temp_mapping_output_file);
    num_read_bytes += fread(&cell_barcode, sizeof(uint32_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&cell_barcode, sizeof(uint64_t), 1, temp_mapping_output_file);
    num_read_bytes += fread(&rid1, sizeof(int), 1, temp_mapping_output_file);
    num_read_bytes += fread(&rid2, sizeof(int), 1, temp_mapping_output_file);
    num_read_bytes += fread(&pos1, sizeof(uint32_t), 1, temp_mapping_output_file);
@@ -560,7 +560,7 @@ struct PairsMapping {

struct MappingWithBarcode {
  uint32_t read_id;
  uint32_t cell_barcode;
  uint64_t cell_barcode;
  uint32_t fragment_start_position;
  uint16_t fragment_length;
  uint8_t mapq : 6, direction : 1, is_unique : 1;
@@ -575,7 +575,7 @@ struct MappingWithBarcode {
  bool HasSamePosition(const MappingWithBarcode& m) const {
    return std::tie(fragment_start_position) == std::tie(m.fragment_start_position);
  }
  uint32_t GetBarcode() const {
  uint64_t GetBarcode() const {
    return cell_barcode;
  }
  void Tn5Shift() {
@@ -635,7 +635,7 @@ struct MappingWithoutBarcode {

struct PairedEndMappingWithBarcode {
  uint32_t read_id;
  uint32_t cell_barcode;
  uint64_t cell_barcode;
  uint32_t fragment_start_position;
  uint16_t fragment_length;
  uint8_t mapq : 6, direction : 1, is_unique : 1;
@@ -652,7 +652,7 @@ struct PairedEndMappingWithBarcode {
  bool HasSamePosition(const PairedEndMappingWithBarcode& m) const {
    return std::tie(fragment_start_position, fragment_length) == std::tie(m.fragment_start_position, m.fragment_length);
  }
  uint32_t GetBarcode() const {
  uint64_t GetBarcode() const {
    return cell_barcode;
  }
  void Tn5Shift() {
@@ -1016,7 +1016,7 @@ class OutputTools {
    const char *sequence_name = reference.GetSequenceNameAt(rid);
    fprintf(peak_output_file_, "%s\t%u\t%u\n", sequence_name, peak_start_position + 1, peak_start_position + peak_length);
  }
  void AppendBarcodeOutput(uint32_t barcode_key) {
  void AppendBarcodeOutput(uint64_t barcode_key) {
    fprintf(barcode_output_file_, "%s-1\n", Seed2Sequence(barcode_key, cell_barcode_length_).data());
  }
  void WriteMatrixOutputHead(uint64_t num_peaks, uint64_t num_barcodes, uint64_t num_lines) {
+6 −0
Original line number Diff line number Diff line
@@ -29,10 +29,12 @@ uint32_t SequenceBatch::LoadBatch() {
    if (length > 0) {
      kseq_t *sequence = sequence_batch_[sequence_index];
      std::swap(sequence_kseq_->seq, sequence->seq);
      ReplaceByEffectiveRange(sequence->seq);
      std::swap(sequence_kseq_->name, sequence->name);
      std::swap(sequence_kseq_->comment, sequence->comment);
      if (sequence_kseq_->qual.l != 0) { // fastq file
        std::swap(sequence_kseq_->qual, sequence->qual);
        ReplaceByEffectiveRange(sequence->qual);
      }
      sequence->id = num_loaded_sequences_;
      ++num_loaded_sequences_;
@@ -66,12 +68,14 @@ bool SequenceBatch::LoadOneSequenceAndSaveAt(uint32_t sequence_index) {
  if (length > 0) {
    kseq_t *sequence = sequence_batch_[sequence_index];
    std::swap(sequence_kseq_->seq, sequence->seq);
    ReplaceByEffectiveRange(sequence->seq);
    std::swap(sequence_kseq_->name, sequence->name);
    std::swap(sequence_kseq_->comment, sequence->comment);
    sequence->id = num_loaded_sequences_;
    ++num_loaded_sequences_;
    if (sequence_kseq_->qual.l != 0) { // fastq file
      std::swap(sequence_kseq_->qual, sequence->qual);
        ReplaceByEffectiveRange(sequence->qual);
    } 
  } else {
    if (length != -1) {
@@ -96,10 +100,12 @@ uint32_t SequenceBatch::LoadAllSequences() {
      sequence_batch_.emplace_back((kseq_t*)calloc(1, sizeof(kseq_t)));
      kseq_t *sequence = sequence_batch_.back();
      std::swap(sequence_kseq_->seq, sequence->seq);
      ReplaceByEffectiveRange(sequence->seq);
      std::swap(sequence_kseq_->name, sequence->name);
      std::swap(sequence_kseq_->comment, sequence->comment);
      if (sequence_kseq_->qual.l != 0) { // fastq file
        std::swap(sequence_kseq_->qual, sequence->qual);
        ReplaceByEffectiveRange(sequence->qual);
      }
      sequence->id = num_loaded_sequences_;
      ++num_loaded_sequences_;
Loading