Commit 1e6ace96 authored by cziegenhain's avatar cziegenhain
Browse files

small improvements

parent eb0ea5f1
Loading
Loading
Loading
Loading
+1 −1
Original line number Original line Diff line number Diff line
@@ -31,7 +31,7 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.


## Changelog
## Changelog
17 Sept 2020: zUMIs.2.9.4b: Fix Smart-seq3 UMI read counting
18 Sept 2020: zUMIs.2.9.4b/c: Fix Smart-seq3 UMI read counting. Prevent crash when a chunk of cell BCs does not match any downsampling. Speed up barcode detection steps for some cases.


12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Significantly speed up writing of error-corrected UMI tags to the bam file. Prevent potential crash when no cells meet any user-defined downsampling criteria.
12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Significantly speed up writing of error-corrected UMI tags to the bam file. Prevent potential crash when no cells meet any user-defined downsampling criteria.


+30 −19
Original line number Original line Diff line number Diff line
@@ -282,8 +282,10 @@ check_nonUMIcollapse <- function(seqfiles){
}
}


collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
  subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
  umiFUN<-"umiCollapseID"
  umiFUN<-"umiCollapseID"
  
  if(nrow(subsample.splits)>0){
    subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
    parallel::mclapply(mapList,function(tt){
    parallel::mclapply(mapList,function(tt){
      ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
      ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
                                      bccount=bccount,
                                      bccount=bccount,
@@ -299,17 +301,26 @@ collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
      names(ll$downsampling)<-subNames
      names(ll$downsampling)<-subNames
      ll
      ll
    }, mc.cores = length(mapList))
    }, mc.cores = length(mapList))

  }else{
    parallel::mclapply(mapList,function(tt){
      ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
                                      bccount=bccount,
                                      ftype=tt))
      ll
    }, mc.cores = length(mapList))
  }
}
}




# Merge a freshly processed chunk of count tables into the accumulated list.
# For every feature type in `alldt`, row-binds the primary ("all") table from
# `newdt`, and — when the new chunk carries downsampling results — row-binds
# each named downsampling table as well. Returns the grown accumulator.
bindList <- function(alldt, newdt) {
  for (feat in names(alldt)) {
    # first element is always the full ("all") count table
    alldt[[feat]][[1]] <- rbind(alldt[[feat]][[1]], newdt[[feat]][[1]])
    # downsampling tables only exist when subsampling splits were requested
    if ("downsampling" %in% names(newdt[[feat]])) {
      for (ds in names(alldt[[feat]][[2]])) {
        alldt[[feat]][[2]][[ds]] <- rbind(alldt[[feat]][[2]][[ds]],
                                          newdt[[feat]][[2]][[ds]])
      }
    }
  }
  return(alldt)
}


+25 −4
Original line number Original line Diff line number Diff line
@@ -162,8 +162,29 @@ setDownSamplingOption<-function( down ,bccount, filename=NULL){
  # Observed barcode length differs from the whitelist barcode length:
  # try to reconcile by matching a sub-piece of the observed barcode.
  if(nchar(bccount[keep==TRUE,XC][1]) != nchar(bc_wl[1])){
    print("length of barcodes not equal to given barcode list, trying to match up...")
    search_vector <- bccount[keep==TRUE,XC]
    # Ensure bc_matched is defined even when no BC piece matches the whitelist
    # length below; otherwise length(bc_matched) would error on an unbound name.
    bc_matched <- integer(0)

    # First check if a partial barcode (one BC piece of the base definition)
    # matches the length of the whitelist barcodes.
    bc_definition <- sapply(opt$sequence_files, function(x) grep("BC", x$base_definition, value = TRUE))
    if(length(bc_definition) > 1){ # only makes sense with at least 2 BC pieces defined
      # strip the "BC(" prefix and ")" suffix, leaving the "start-end" range
      bc_definition <- sapply(bc_definition, function(x) substr(x = x, start = 4, stop = nchar(x) - 1))
      bc_len_mat <- t(matrix(as.numeric(unlist(strsplit(bc_definition, "-"))), ncol = length(bc_definition)))
      bc_lens <- bc_len_mat[,2] - bc_len_mat[,1] + 1 # length of each barcode piece
      if(sum(bc_lens == nchar(bc_wl[1])) == 1){ # exactly one piece matches the whitelist length
        match_piece <- which(bc_lens == nchar(bc_wl[1]))
        sub_start <- 1
        sub_stop <- bc_lens[match_piece]
        if(match_piece > 1){
          # Offset by the TOTAL length of all preceding pieces. sum() is
          # required here: bc_lens[1:(match_piece-1)] is a vector when
          # match_piece > 2, which would make sub_start/sub_stop vectors
          # and silently corrupt the substring extraction below.
          sub_start <- sub_start + sum(bc_lens[1:(match_piece - 1)])
          sub_stop <- sub_stop + sum(bc_lens[1:(match_piece - 1)])
        }
        XC_vector_substring <- substr(search_vector, start = sub_start, stop = sub_stop)
        bc_matched <- which(XC_vector_substring %in% bc_wl)
      }
    }else{
      # Fall back to matching each whitelist barcode as a pattern anywhere in
      # the observed barcodes (parallelized over whitelist entries).
      bc_matched <- parallel::mcmapply(function(x) grep(pattern = x, x = search_vector), bc_wl, mc.cores = opt$num_threads, mc.preschedule = TRUE)
      bc_matched <- unlist(bc_matched)
    }

    if(length(bc_matched) > 0){
      # Drop all observed barcodes that could not be matched to the whitelist.
      to_remove <- search_vector[-bc_matched]
      bccount[XC %in% to_remove, keep := FALSE]
@@ -242,8 +263,8 @@ BCbin <- function(bccount_file, bc_detected) {
  nocell_BCs <- nocell_bccount[,XC]
  nocell_BCs <- nocell_bccount[,XC]


  if(opt$barcodes$BarcodeBinning>0){
  if(opt$barcodes$BarcodeBinning>0){
    #break up in pieces of 1000 real BCs in case the hamming distance calculation gets too large!
    #break up in pieces of 2000 real BCs in case the hamming distance calculation gets too large!
    true_chunks <- split(true_BCs, ceiling(seq_along(true_BCs)/1000))
    true_chunks <- split(true_BCs, ceiling(seq_along(true_BCs)/2000))
    for(i in 1:length(true_chunks)){
    for(i in 1:length(true_chunks)){
      dists <- stringdist::stringdistmatrix(true_chunks[[i]],nocell_BCs,method="hamming", nthread = opt$num_threads)
      dists <- stringdist::stringdistmatrix(true_chunks[[i]],nocell_BCs,method="hamming", nthread = opt$num_threads)
      dists <- setDT(data.frame(dists))
      dists <- setDT(data.frame(dists))
+1 −1
Original line number Original line Diff line number Diff line
@@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
vers=2.9.4b
vers=2.9.4c
currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/main/zUMIs.sh | grep '^vers=' | cut -f2 -d "=")
currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/main/zUMIs.sh | grep '^vers=' | cut -f2 -d "=")
if [ "$currentv" != "$vers" ] ; then
if [ "$currentv" != "$vers" ] ; then
    echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";
    echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";