Commit 84bba39b authored by Haowen Zhang's avatar Haowen Zhang
Browse files

A painful implementation of split alignment with ed

parent 85817863
Loading
Loading
Loading
Loading
+243 −60

File changed.

Preview size limit exceeded, changes collapsed.

+8 −5
Original line number Diff line number Diff line
@@ -80,7 +80,7 @@ class Chromap {
  }

  // For mapping
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool Tn5_shift, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, bool output_mapping_in_SAM, bool low_memory_mode, bool cell_by_bin, int bin_size, uint16_t depth_cutoff_to_call_peak, int peak_min_length, int peak_merge_max_length, const std::string &reference_file_path, const std::string &index_file_path, const std::vector<std::string> &read_file1_paths, const std::vector<std::string> &read_file2_paths, const std::vector<std::string> &barcode_file_paths, const std::string &barcode_whitelist_file_path, const std::string &mapping_output_file_path, const std::string &matrix_output_prefix) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), Tn5_shift_(Tn5_shift), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), output_mapping_in_SAM_(output_mapping_in_SAM), low_memory_mode_(low_memory_mode), cell_by_bin_(cell_by_bin), bin_size_(bin_size), depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak), peak_min_length_(peak_min_length), peak_merge_max_length_(peak_merge_max_length), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_paths_(read_file1_paths), read_file2_paths_(read_file2_paths), barcode_file_paths_(barcode_file_paths), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path), matrix_output_prefix_(matrix_output_prefix) {
  Chromap(int error_threshold, int match_score, int mismatch_penalty, const std::vector<int> &gap_open_penalties, const std::vector<int> &gap_extension_penalties, int min_num_seeds_required_for_mapping, const std::vector<int> &max_seed_frequencies, int max_num_best_mappings, int max_insert_size, uint8_t mapq_threshold, int num_threads, int min_read_length, int multi_mapping_allocation_distance, int multi_mapping_allocation_seed, int drop_repetitive_reads, bool trim_adapters, bool remove_pcr_duplicates, bool is_bulk_data, bool allocate_multi_mappings, bool only_output_unique_mappings, bool Tn5_shift, bool split_alignment, bool output_mapping_in_BED, bool output_mapping_in_TagAlign, bool output_mapping_in_PAF, bool output_mapping_in_SAM, bool low_memory_mode, bool cell_by_bin, int bin_size, uint16_t depth_cutoff_to_call_peak, int peak_min_length, int peak_merge_max_length, const std::string &reference_file_path, const std::string &index_file_path, const std::vector<std::string> &read_file1_paths, const std::vector<std::string> &read_file2_paths, const std::vector<std::string> &barcode_file_paths, const std::string &barcode_whitelist_file_path, const std::string &mapping_output_file_path, const std::string &matrix_output_prefix) : error_threshold_(error_threshold), match_score_(match_score), mismatch_penalty_(mismatch_penalty), gap_open_penalties_(gap_open_penalties), gap_extension_penalties_(gap_extension_penalties), min_num_seeds_required_for_mapping_(min_num_seeds_required_for_mapping), max_seed_frequencies_(max_seed_frequencies), max_num_best_mappings_(max_num_best_mappings), max_insert_size_(max_insert_size), mapq_threshold_(mapq_threshold), num_threads_(num_threads), min_read_length_(min_read_length), multi_mapping_allocation_distance_(multi_mapping_allocation_distance), multi_mapping_allocation_seed_(multi_mapping_allocation_seed), drop_repetitive_reads_(drop_repetitive_reads), trim_adapters_(trim_adapters), remove_pcr_duplicates_(remove_pcr_duplicates), is_bulk_data_(is_bulk_data), allocate_multi_mappings_(allocate_multi_mappings), only_output_unique_mappings_(only_output_unique_mappings), Tn5_shift_(Tn5_shift), split_alignment_(split_alignment), output_mapping_in_BED_(output_mapping_in_BED), output_mapping_in_TagAlign_(output_mapping_in_TagAlign), output_mapping_in_PAF_(output_mapping_in_PAF), output_mapping_in_SAM_(output_mapping_in_SAM), low_memory_mode_(low_memory_mode), cell_by_bin_(cell_by_bin), bin_size_(bin_size), depth_cutoff_to_call_peak_(depth_cutoff_to_call_peak), peak_min_length_(peak_min_length), peak_merge_max_length_(peak_merge_max_length), reference_file_path_(reference_file_path), index_file_path_(index_file_path), read_file1_paths_(read_file1_paths), read_file2_paths_(read_file2_paths), barcode_file_paths_(barcode_file_paths), barcode_whitelist_file_path_(barcode_whitelist_file_path), mapping_output_file_path_(mapping_output_file_path), matrix_output_prefix_(matrix_output_prefix) {
    barcode_lookup_table_ = kh_init(k32);
    barcode_whitelist_lookup_table_ = kh_init(k32);
    barcode_histogram_ = kh_init(k32);
@@ -133,8 +133,8 @@ class Chromap {

  // For single-end read mapping
  void MapSingleEndReads();
  void GenerateBestMappingsForSingleEndRead(int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<std::pair<int, uint64_t> > &positive_mappings, const std::vector<std::pair<int, uint64_t> > &negative_mappings, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void ProcessBestMappingsForSingleEndRead(Direction mapping_direction, uint8_t mapq, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void GenerateBestMappingsForSingleEndRead(int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<std::pair<int, uint64_t> > &positive_mappings, const std::vector<int> &positive_split_sites, const std::vector<std::pair<int, uint64_t> > &negative_mappings, const std::vector<int> &negative_split_sites, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void ProcessBestMappingsForSingleEndRead(Direction mapping_direction, uint8_t mapq, int min_num_errors, int num_best_mappings, int second_min_num_errors, int num_second_best_mappings, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const SequenceBatch &barcode_batch, const std::vector<int> &best_mapping_indices, const std::vector<std::pair<int, uint64_t> > &mappings, const std::vector<int> &split_sites, int *best_mapping_index, int *num_best_mappings_reported, std::vector<std::vector<MappingRecord> > *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char* read_name, uint16_t read_length, uint32_t barcode, uint32_t fragment_start_position, uint16_t fragment_length, uint8_t mapq, uint8_t direction, uint8_t is_unique, uint8_t num_dups, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
  void EmplaceBackMappingRecord(uint32_t read_id, const char *read_name, uint8_t num_dups, int64_t position, int rid, int flag, uint8_t direction, uint8_t is_unique, uint8_t mapq, uint32_t NM, int n_cigar, uint32_t *cigar, std::vector<MappingRecord> *mappings_on_diff_ref_seqs);
@@ -144,6 +144,8 @@ class Chromap {
  // Supportive functions
  void ConstructIndex();
  int BandedAlignPatternToText(const char *pattern, const char *text, const int read_length, int *mapping_end_location);
  int BandedAlignPatternToTextWithDropOff(const char *pattern, const char *text, const int read_length, int *mapping_end_location);
  int BandedAlignPatternToTextWithDropOffFrom3End(const char *pattern, const char *text, const int read_length, int *mapping_end_position);
  void BandedAlign4PatternsToText(const char **patterns, const char *text, int read_length, int32_t *mapping_edit_distances, int32_t *mapping_end_positions);
  void BandedAlign8PatternsToText(const char **patterns, const char *text, int read_length, int16_t *mapping_edit_distances, int16_t *mapping_end_positions);
  void BandedTraceback(int min_num_errors, const char *pattern, const char *text, const int read_length, int *mapping_start_position);
@@ -151,8 +153,8 @@ class Chromap {
  void SupplementCandidates(const Index &index, uint32_t repetitive_seed_length1, uint32_t repetitive_seed_length2, std::vector<std::pair<uint64_t, uint64_t> > &minimizers1, std::vector<std::pair<uint64_t, uint64_t> > &minimizers2, std::vector<uint64_t> &positive_hits1, std::vector<uint64_t> &positive_hits2, std::vector<Candidate> &positive_candidates1, std::vector<Candidate> &positive_candidates2, std::vector<Candidate> &positive_candidates1_buffer, std::vector<Candidate> &positive_candidates2_buffer, std::vector<uint64_t> &negative_hits1, std::vector<uint64_t> &negative_hits2, std::vector<Candidate> &negative_candidates1, std::vector<Candidate> &negative_candidates2, std::vector<Candidate> &negative_candidates1_buffer, std::vector<Candidate> &negative_candidates2_buffer);
  void PostProcessingInLowMemory(uint32_t num_mappings_in_mem, uint32_t num_reference_sequences, const SequenceBatch &reference);
  void VerifyCandidatesOnOneDirectionUsingSIMD(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<Candidate> &positive_candidates, const std::vector<Candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<std::pair<int, uint64_t> > *negative_mappings, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidatesOnOneDirection(Direction candidate_direction, const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<Candidate> &candidates, std::vector<std::pair<int, uint64_t> > *mappings, std::vector<int> *split_sites, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void VerifyCandidates(const SequenceBatch &read_batch, uint32_t read_index, const SequenceBatch &reference, const std::vector<std::pair<uint64_t, uint64_t> > &minimizers, const std::vector<Candidate> &positive_candidates, const std::vector<Candidate> &negative_candidates, std::vector<std::pair<int, uint64_t> > *positive_mappings, std::vector<int> *positive_split_sites, std::vector<std::pair<int, uint64_t> > *negative_mappings, std::vector<int> *negative_split_sites, int *min_num_errors, int *num_best_mappings, int *second_min_num_errors, int *num_second_best_mappings);
  void AllocateMultiMappings(uint32_t num_reference_sequences);
  void RemovePCRDuplicate(uint32_t num_reference_sequences);
  uint32_t MoveMappingsInBuffersToMappingContainer(uint32_t num_reference_sequences, std::vector<std::vector<std::vector<MappingRecord> > > *mappings_on_diff_ref_seqs_for_diff_threads_for_saving);
@@ -217,6 +219,7 @@ class Chromap {
  bool allocate_multi_mappings_;
  bool only_output_unique_mappings_;
  bool Tn5_shift_;
  bool split_alignment_;
  bool output_mapping_in_BED_;
  bool output_mapping_in_TagAlign_;
  bool output_mapping_in_PAF_;
+12 −1
Original line number Diff line number Diff line
@@ -502,7 +502,7 @@ static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar,
	return cigar;
}

int ksw_semi_global2(int qlen, const char *query, int tlen, const char *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
int ksw_semi_global3(int qlen, const char *query, int tlen, const char *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_, int *mapping_start_position, int *mapping_end_position)
{
	eh_t *eh;
	int8_t *qp; // query profile
@@ -597,6 +597,9 @@ int ksw_semi_global2(int qlen, const char *query, int tlen, const char *target,
      score = eh[qlen - j].h;
      max_score_position = qlen - j;
    }
  }
  if (mapping_end_position) {
    *mapping_end_position = max_score_position;
  }
	if (n_cigar_ && cigar_) { // backtrack
		int n_cigar = 0, m_cigar = 0, which = 0;
@@ -611,6 +614,9 @@ int ksw_semi_global2(int qlen, const char *query, int tlen, const char *target,
			else                 cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
		}
		if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
    if (mapping_start_position) {
      *mapping_start_position = k;
    }
		//if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
@@ -625,6 +631,11 @@ int ksw_semi_global(int qlen, const char *query, int tlen, const char *target, i
	return ksw_semi_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_);
}

int ksw_semi_global2(int qlen, const char *query, int tlen, const char *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
{
	return ksw_semi_global3(qlen, query, tlen, target, m, mat, o_del, e_del, o_ins, e_ins, w, n_cigar_, cigar_, NULL, NULL);
}

int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
{
	eh_t *eh;
+1 −0
Original line number Diff line number Diff line
@@ -84,6 +84,7 @@ extern "C" {
	int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
	int ksw_semi_global(int qlen, const char *query, int tlen, const char *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
	int ksw_semi_global2(int qlen, const char *query, int tlen, const char *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
	int ksw_semi_global3(int qlen, const char *query, int tlen, const char *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar, int *mapping_start_position, int *mapping_end_position);

	/**
	 * Extend alignment