2.9.4 UB tag write speed (4e23bf92) · Commits · github_fork / ZUMIs

README.md

+2 −0

Original line number	Diff line number	Diff line
		@@ -31,6 +31,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
		zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

		## Changelog
		12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Significantly speed up writing of error-corrected UMI tags to the bam file. Prevent a potential crash when no cells meet any user-defined downsampling criteria.

		19 July 2020: [zUMIs2.9.3](https://github.com/sdparekh/zUMIs/releases/tag/2.9.3): Add zUMIs version number to header of unmapped bam files. Several bug fixes: prevent error during mapping with memory handling; incorrect Smart-seq3 UMI-fragment counting.

		14 July 2020: [zUMIs2.9.2](https://github.com/sdparekh/zUMIs/releases/tag/2.9.2): Several bug fixes: Prevent RAM from ballooning, issues with resuming from a different stage. Speed up demultiplexing further by chromosome-wise operations. Remove the need for a second bam file sorting after hamming collapse by keeping the sort order.

UMIstuffFUN.R

+50 −34

Original line number	Diff line number	Diff line
		@@ -334,6 +334,22 @@ write_molecule_mapping <- function(mm){
		}
		}

		# Write error-corrected UMIs into the UB tag of a bam file by running the
		# correct_UBtag.py helper script with python3.
		#
		# Arguments:
		#   inbamfile  path of the bam file passed to the script via --bam.
		#   n          name used to build the molecule mapping stub
		#              "<out_dir>/zUMIs_output/molecule_mapping/<n>." passed via --stub.
		#
		# Reads out_dir, project, num_threads and zUMIs_directory from the global
		# `opt` list.
		#
		# Returns the path of the UB-corrected bam file
		# "<out_dir>/<project>.filtered.Aligned.GeneTagged.UBcorrected.sorted.bam".
		# Stops with an error when the script exits with a non-zero status, so a
		# failed run is not mistaken for a finished output file.
		correct_UB_tags_new <- function(inbamfile, n) {
		  mm_path <- paste0(opt$out_dir, "/zUMIs_output/molecule_mapping/", n, ".")
		  outbamfile <- paste0(opt$out_dir, "/", opt$project, ".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
		  bcpath <- paste0(opt$out_dir, "/zUMIs_output/", opt$project, "kept_barcodes_binned.txt")
		  use_threads <- opt$num_threads
		  pypath <- paste0(opt$zUMIs_directory, "/correct_UBtag.py")

		  # Quote every path so directories containing spaces or shell
		  # metacharacters reach the script as a single argument.
		  UBcmd <- paste("python3", shQuote(pypath),
		                 "--bam", shQuote(inbamfile),
		                 "--out", shQuote(outbamfile),
		                 "--p", use_threads,
		                 "--bcs", shQuote(bcpath),
		                 "--stub", shQuote(mm_path))

		  # system() returns the exit status of the command when intern = FALSE.
		  exit_status <- system(UBcmd)
		  if (exit_status != 0) {
		    stop("correct_UBtag.py failed with exit status ", exit_status,
		         " while writing ", outbamfile, call. = FALSE)
		  }
		  return(outbamfile)
		}

		correct_UB_tags <- function(bccount, samtoolsexc){
		mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
		demux_path <- paste0(opt$out_dir,"/zUMIs_output/demultiplexed/")

barcodeIDFUN.R

+12 −8

Original line number	Diff line number	Diff line
		@@ -33,6 +33,10 @@ setDownSamplingOption<-function( down ,bccount, filename=NULL){
		}
		colnames(subsample.splits)<-c("minR","maxR")

		if(nrow(subsample.splits) == 0){
		subsample.splits <- setDownSamplingOption(down = "0", bccount = bccount, filename = filename)
		}

		return( subsample.splits )
		}

correct_UBtag.py

0 → 100644

+136 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env python3
		import os
		import pysam
		import argparse
		import multiprocessing as mp

		def collect_bam_chunks(inpath, chrs, outpath):
		    """Concatenate the temporary per-chromosome bam chunks into ``outpath``.

		    Chunk paths are rebuilt as ``inpath + ".tmp." + name + ".bam"`` for every
		    entry of ``chrs`` except the last one, followed by the ``unmapped`` chunk.
		    The chunks are joined with ``pysam.cat`` and then deleted from disk.
		    """
		    chunk_paths = []
		    for contig in chrs[:-1]:
		        chunk_paths.append(inpath+".tmp."+contig+".bam")
		    # The last entry of chrs is replaced by the chunk labelled "unmapped".
		    chunk_paths.append(inpath+".tmp."+"unmapped"+".bam")

		    pysam.cat('-o', outpath, *chunk_paths)

		    # Remove the temporary chunks once they have been concatenated.
		    for chunk in chunk_paths:
		        os.remove(chunk)
		    #pysam.index(outpath)

		def load_bcs(bcpath):
		    """Read cell barcodes from a comma-separated text file.

		    The first line is treated as a header and discarded. For every following
		    non-blank line the text before the first comma is taken as the barcode.

		    Args:
		        bcpath: path to the barcode table.

		    Returns:
		        List of barcode strings in file order.
		    """
		    bc = []
		    with open(bcpath) as f:
		        f.readline()  # remove header
		        for line in f:
		            # Drop the line terminator first so a line without any comma
		            # does not yield a barcode that ends in a newline.
		            line = line.rstrip('\r\n')
		            if not line.strip():
		                continue  # ignore blank lines (e.g. trailing newline at EOF)
		            bc.append(line.split(',')[0])
		    return bc

		def load_dict(stub, bcs):
		    """Load the per-barcode molecule tables into a nested lookup dictionary.

		    For each barcode in ``bcs`` the tab-separated file ``stub + barcode + ".txt"``
		    is read if it exists; barcodes without a file are left out of the result.
		    The first line of each file is skipped as a header. For every other line,
		    column 3 is used as the second-level key, column 0 as the third-level key
		    and column 1 as the stored value, i.e.
		    ``result[barcode][col3][col0] = col1``. ``return_UB`` queries this
		    structure as ``moldict[BC][GE][UX]``.

		    Args:
		        stub: path prefix of the molecule tables.
		        bcs: iterable of barcode strings.

		    Returns:
		        Nested dict ``{barcode: {col3: {col0: col1}}}``.

		    Raises:
		        IndexError: if a non-blank line has fewer than four columns.
		    """
		    molecules_dict = {}
		    for barcode in bcs:
		        fp = stub+barcode+".txt"
		        if not os.path.exists(fp):
		            continue
		        per_barcode = molecules_dict[barcode] = {}
		        with open(fp) as f:
		            f.readline()  # remove header
		            # Stream the file instead of holding every line in memory.
		            for line in f:
		                fields = line.strip().split('\t')
		                if fields == ['']:
		                    continue  # blank line, nothing to record
		                per_barcode.setdefault(fields[3], {})[fields[0]] = fields[1]
		    return molecules_dict

		# def return_UB(moldict, BC, GE, UX):
		# UB = UX
		# if BC in moldict:
		# if GE in moldict[BC]:
		# if UX in moldict[BC][GE]:
		# UB = moldict[BC][GE][UX]
		# return(UB)

		def return_UB(moldict, BC, GE, UX):
		    """Look up the value stored at ``moldict[BC][GE][UX]``.

		    Returns the stored value when all three keys are present; if any of the
		    keys is missing, ``UX`` is returned unchanged.
		    """
		    try:
		        return moldict[BC][GE][UX]
		    except KeyError:
		        # No entry for this barcode / gene / UMI combination.
		        return UX

		def correct_tags(inpath, threads, chr):
		    """Rewrite the UB tag of all reads fetched for one reference name.

		    Reads are fetched from ``inpath`` with ``fetch(chr)`` and written to the
		    temporary file ``inpath + ".tmp." + label + ".bam"``, where ``label`` is
		    ``chr`` or ``"unmapped"`` when ``chr`` is ``"*"``. For every read the
		    current ``UB`` value is copied into the ``UX`` tag, and ``UB`` is replaced
		    by the value ``return_UB`` finds in the module-level ``mols`` dictionary
		    for the read's ``BC`` tag, its ``GE`` tag (``"NA"`` if absent) and the
		    original ``UB``.

		    Args:
		        inpath: path of the input bam file.
		        threads: thread count passed to both pysam file handles.
		        chr: reference name to fetch, or ``"*"``.
		    """
		    global mols
		    #nreads = 0
		    chrlabel = 'unmapped' if chr == '*' else chr
		    outpath = inpath+".tmp."+chrlabel+".bam"
		    # Context managers close both handles even if a read raises part-way
		    # through, so no file descriptor is leaked in the worker process.
		    with pysam.AlignmentFile(inpath, 'rb', threads = threads) as inp:
		        with pysam.AlignmentFile(outpath, 'wb', template = inp, threads = threads) as out:
		            for read in inp.fetch(chr):
		                #nreads += 1
		                umi = read.get_tag('UB')
		                cell = read.get_tag('BC')
		                gene = read.get_tag('GE') if read.has_tag('GE') else 'NA'
		                # Keep the uncorrected UMI in UX before overwriting UB.
		                read.set_tag(tag = 'UX', value = umi, value_type = 'Z')
		                umi_new = return_UB(moldict = mols, BC = cell, GE = gene, UX = umi)
		                read.set_tag(tag = 'UB', value = umi_new, value_type = 'Z')
		                out.write(read)
		    #print("Number of reads processed: "+nreads)

		def main():
		    """Command-line entry point: correct UB tags of a bam file in parallel.

		    Steps, in order:
		      1. Load the barcodes (``--bcs``) and the molecule tables
		         (``--stub``) into the module-level ``mols`` dictionary.
		      2. Take the reference names from ``pysam.idxstats`` of ``--bam``.
		      3. Run ``correct_tags`` once per reference name in a process pool.
		      4. Concatenate the per-reference chunks into ``--out`` with
		         ``collect_bam_chunks``.
		    """
		    global mols

		    parser = argparse.ArgumentParser(add_help=True)
		    # The four path arguments have no usable default, so a missing one is
		    # reported by argparse instead of failing later on a None path.
		    parser.add_argument('--bam', type=str, metavar='FILENAME', required=True,
		                        help='Path to input BAM file')
		    parser.add_argument('--out', type=str, metavar='FILENAME', required=True,
		                        help='Path to output bam file')
		    parser.add_argument('--p', type=int, default = 10,
		                        help='Number of processes for bams')
		    parser.add_argument('--bcs', type=str, metavar='FILENAME', required=True,
		                        help='Path to kept barcodes')
		    parser.add_argument('--stub', type=str, metavar='FILENAME', required=True,
		                        help='Molecule table path stub')

		    args = parser.parse_args()

		    ####
		    #bcs = load_bcs('zUMIs_output/hSkinkept_barcodes.txt')
		    #mols = load_dict('zUMIs_output/molecule_mapping/hSkin.', bcs)
		    #bam = 'hSkin.filtered.Aligned.GeneTagged.sorted.bam'
		    #chrs = pysam.idxstats(bam).split('\n')
		    #chrs = [c.split('\t')[0] for c in chrs[:-1]]
		    #pysam_workers = 3
		    #n_jobs = 10
		    #pool = mp.Pool(n_jobs)
		    #results = [pool.apply_async(correct_tags, (bam,pysam_workers,chr, )) for chr in chrs]
		    ####

		    bcs = load_bcs(args.bcs)
		    print("Loading molecule correction dictionary...")
		    # mols is assigned at module level so correct_tags can read it in the
		    # pool workers.
		    mols = load_dict(args.stub, bcs)
		    print("Correcting UB tags...")

		    # The idxstats output ends with a newline, so the last split element is
		    # an empty string and is dropped; column 0 of each row is the name.
		    chrs = pysam.idxstats(args.bam).split('\n')
		    chrs = [c.split('\t')[0] for c in chrs[:-1]]

		    # With more than 8 processes requested, give each job 2 pysam threads
		    # and halve the number of jobs; otherwise 1 thread per job.
		    if args.p > 8:
		        pysam_workers = 2
		        n_jobs = args.p // 2
		    else:
		        pysam_workers = 1
		        n_jobs = max(1, args.p)  # Pool() rejects a process count below 1

		    # The context manager shuts the pool down once every result has been
		    # collected, instead of leaving worker processes to interpreter exit.
		    with mp.Pool(n_jobs) as pool:
		        results = [pool.apply_async(correct_tags, (args.bam,pysam_workers,chr, )) for chr in chrs]
		        # .get() blocks until the job is done and re-raises worker errors.
		        for r in results:
		            r.get()

		    collect_bam_chunks(inpath = args.bam, chrs = chrs, outpath = args.out)

		if __name__ == "__main__":
		    main()

zUMIs-dge2.R

+6 −4

Original line number	Diff line number	Diff line
		@@ -161,10 +161,12 @@ if(opt$counting_opts$Ham_Dist == 0){
		reads <- reads[!UB==""] #make sure only UMI-containing reads go further
		u <- umiCollapseHam(reads,bccount, HamDist=opt$counting_opts$Ham_Dist)
		}
		print("Demultiplexing output bam file by cell barcode...")
		demultiplex_bam(opt, outbamfile, nBCs = length(unique(bccount$XC)), bccount = bccount, samtoolsexc = samtoolsexc)
		#print("Demultiplexing output bam file by cell barcode...")
		#demultiplex_bam(opt, outbamfile, nBCs = length(unique(bccount$XC)), bccount = bccount, samtoolsexc = samtoolsexc)
		print("Correcting UMI barcode tags...")
		sortbamfile <- correct_UB_tags(bccount, samtoolsexc)
		sortbamfile <- correct_UB_tags_new(outbamfile, opt$project)
		file.remove(outbamfile)
		#sortbamfile <- correct_UB_tags(bccount, samtoolsexc)
		#sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
		bccount<-splitRG(bccount=bccount, mem= opt$mem_limit, hamdist = 0) # allow more reads to be in RAM fur subsequent steps
		}
		@@ -274,7 +276,7 @@ if(opt$counting_opts$intronProb == TRUE){
		}

		#demultiplexing
		if(opt$counting_opts$Ham_Dist == 0 && opt$barcodes$demultiplex == TRUE ){ #otherwise its already demultiplexed!
		if(opt$barcodes$demultiplex){
		print("Demultiplexing output bam file by cell barcode...")
		demultiplex_bam(opt, sortbamfile, nBCs = length(unique(bccount$XC)), bccount = bccount, samtoolsexc = samtoolsexc)
		}

Admin message