Unverified Commit 85817863 authored by Haowen Zhang's avatar Haowen Zhang Committed by GitHub
Browse files

Merge pull request #1 from haowenz/sam-output

Generate SAM with ksw for single end read mapping
parents 3ea11d4f 712b48a8
Loading
Loading
Loading
Loading
+197 −142

File changed.

Preview size limit exceeded, changes collapsed.

+4 −1
Original line number Diff line number Diff line
@@ -80,7 +80,7 @@ class Chromap {
  }

  // For mapping
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool Tn5_shift, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, bool low_memory_mode, bool cell_by_bin, int bin_size, uint16_t depth_cutoff_to_call_peak, int peak_min_length, int peak_merge_max_length, const std::string &reference_file_path, const std::string &index_file_path, const std::vector<std::string> &read_file1_paths, const std::vector<std::string> &read_file2_paths, const std::vector<std::string> &barcode_file_paths, const std::string &barcode_whitelist_file_path, const std::string &mapping_output_file_path, const std::string &matrix_output_prefix) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), Tn5_shift_(Tn5_shift), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), low_memory_mode_(low_memory_mode), cell_by_bin_(cell_by_bin), bin_size_(bin_size), depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak), peak_min_length_(peak_min_length), peak_merge_max_length_(peak_merge_max_length), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_paths_(read_file1_paths), read_file2_paths_(read_file2_paths), barcode_file_paths_(barcode_file_paths), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path), matrix_output_prefix_(matrix_output_prefix) {
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool Tn5_shift, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, bool output_mapping_in_SAM, bool low_memory_mode, bool cell_by_bin, int bin_size, uint16_t depth_cutoff_to_call_peak, int peak_min_length, int peak_merge_max_length, const std::string &reference_file_path, const std::string &index_file_path, const std::vector<std::string> &read_file1_paths, const std::vector<std::string> &read_file2_paths, const std::vector<std::string> &barcode_file_paths, const std::string &barcode_whitelist_file_path, const std::string &mapping_output_file_path, const std::string &matrix_output_prefix) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), Tn5_shift_(Tn5_shift), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), output_mapping_in_SAM_(output_mapping_in_SAM), low_memory_mode_(low_memory_mode), cell_by_bin_(cell_by_bin), bin_size_(bin_size), depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak), peak_min_length_(peak_min_length), peak_merge_max_length_(peak_merge_max_length), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_paths_(read_file1_paths), read_file2_paths_(read_file2_paths), barcode_file_paths_(barcode_file_paths), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path), matrix_output_prefix_(matrix_output_prefix) {
    barcode_lookup_table_ = kh_init(k32);
    barcode_whitelist_lookup_table_ = kh_init(k32);
    barcode_histogram_ = kh_init(k32);
@@ -137,6 +137,7 @@ class Chromap {
  void ProcessBestMappingsForSingleEndRead(Direction mapping_direction, uint8_t mapq, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char* read_name, uint16_t read_length, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void ApplyTn5ShiftOnSingleEndMapping(uint32_t num_reference_sequences, std::vector<std::vector<MappingRecord> > *mappings);
  uint32_t LoadSingleEndReadsWithBarcodes(SequenceBatch *read_batch, SequenceBatch *barcode_batch);

@@ -148,6 +149,7 @@ class Chromap {
  void BandedTraceback(int min_num_errors, const char *pattern, const char *text, const int read_length, int *mapping_start_position);
  void MergeCandidates(std::vector<Candidate> &c1, std::vector<Candidate> &c2, std::vector<Candidate> &buffer);
  void SupplementCandidates(const Index &index, uint32_t repetitive_seed_length1, uint32_t repetitive_seed_length2, std::vector<std::pair<uint64_t, uint64_t> > &minimizers1, std::vector<std::pair<uint64_t, uint64_t> > &minimizers2, std::vector<uint64_t> &positive_hits1, std::vector<uint64_t> &positive_hits2, std::vector<Candidate> &positive_candidates1, std::vector<Candidate> &positive_candidates2, std::vector<Candidate> &positive_candidates1_buffer, std::vector<Candidate> &positive_candidates2_buffer, std::vector<uint64_t> &negative_hits1, std::vector<uint64_t> &negative_hits2, std::vector<Candidate> &negative_candidates1, std::vector<Candidate> &negative_candidates2, std::vector<Candidate> &negative_candidates1_buffer, std::vector<Candidate> &negative_candidates2_buffer);
  void PostProcessingInLowMemory(uint32_t num_mappings_in_mem, uint32_t num_reference_sequences, const SequenceBatch &reference);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<Candidate> &positive_candidates, const std::vector<Candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<std::pair<int, uint64_t> > *negative_mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
@@ -218,6 +220,7 @@ class Chromap {
  bool output_mapping_in_BED_;
  bool output_mapping_in_TagAlign_;
  bool output_mapping_in_PAF_;
  bool output_mapping_in_SAM_;
  uint32_t read_batch_size_ = 500000; // default batch size, # reads for single-end reads, # read pairs for paired-end reads
  bool low_memory_mode_;
  bool cell_by_bin_;
+11 −4
Original line number Diff line number Diff line
@@ -590,21 +590,28 @@ int ksw_semi_global2(int qlen, const char *query, int tlen, const char *target,
	}
	//score = eh[qlen].h;
	score = eh[qlen].h;

  int max_score_position = qlen;
  for (j = 1; j < w; ++j) {
    score = eh[qlen - j].h > score ? eh[qlen - j].h : score;
    if (eh[qlen - j].h > score) {
      score = eh[qlen - j].h;
      max_score_position = qlen - j;
    }
  }
	if (n_cigar_ && cigar_) { // backtrack
		int n_cigar = 0, m_cigar = 0, which = 0;
		uint32_t *cigar = 0, tmp;
		i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
		//i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
		i = tlen - 1; k = max_score_position - 1; // (i,k) points to the last cell
		while (i >= 0 && k >= 0) {
			which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
			//which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
			which = z[(long)i * n_col + (k - i)] >> (which<<1) & 3;
			if (which == 0)      cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
			else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
			else                 cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
		}
		if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
		if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
		//if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
		*n_cigar_ = n_cigar, *cigar_ = cigar;
+395 −75

File changed.

Preview size limit exceeded, changes collapsed.