zUMIs2.9.0: speed improvements (c06bcaa0) · Commits · github_fork / ZUMIs

README.md

+3 −1

Original line number	Diff line number	Diff line
		@@ -8,7 +8,7 @@ git clone https://github.com/sdparekh/zUMIs.git`
		```
		Start anaylysing your data:
		```
		zUMIs/zUMIs-master.sh -c -y my_config.yaml
		zUMIs/zUMIs.sh -c -y my_config.yaml
		```
		zUMIs now comes with its own [miniconda](https://docs.conda.io/en/latest/miniconda.html) environment, so you do not need to deal with dependencies or installations (use the -c flag to use conda).
		If you wish to use your own dependencies, head to the [zUMIs wiki](https://github.com/sdparekh/zUMIs/wiki/Installation#dependencies) to see what is required.
		@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
		zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

		## Changelog
		05 July 2020: [zUMIs2.9.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.7.0): Speed up STAR read mapping by parallel instances if enough resources are available. Change the main zUMIs script to `zUMIs.sh`. Speed up and reduce clutter by loading reads from bam files using parallelised Rsamtools calls instead of printing temporary text files. Speed up counting by parallelising exon / intron / exon+intron counting as well as downsamplings. Speed up by parallelising creation of wide format count matrices.

		23 June 2020: zUMIs2.8.3: Merged code contribution from @gringer: prevent errors by emitting SAM headers in chunked unmapped .bam file output of fqfilter. Changed call to STAR to prevent stalling of samtools pipe.

		18 May 2020: zUMIs2.8.2: Added `merge_demultiplexed_fastq.R` to concatenate previously demultiplexed fastq files. For usage details see here: https://github.com/sdparekh/zUMIs/wiki/Starting-from-demultiplexed-fastq-files

UMIstuffFUN.R

+52 −91

Original line number	Diff line number	Diff line
		@@ -56,96 +56,55 @@ ham_mat <- function(umistrings) {
		nrow(X) - H
		}

		prep_samtools <- function(featfile,bccount,inex,cores,samtoolsexc){
		print("Extracting reads from bam file(s)...")

		nchunks <- length(unique(bccount$chunkID))
		all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")


		for(i in unique(bccount$chunkID)){
		rgfile <- all_rgfiles[i]
		chunks <- bccount[chunkID==i]$XC
		write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
		}

		reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigned = FALSE){
		chunk_bcs <- bccount[chunkID==chunk]$XC
		idxstats <- Rsamtools::idxstatsBam(featfile)
		taglist <- c("BC", "UB","GE")
		if(inex){
		headerXX <- paste( c(paste0("V",1:4)) ,collapse="\t")
		}else{
		headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
		taglist <- c(taglist, "GI")
		}
		write(headerXX,"freadHeader")

		headercommand <- "cat freadHeader > "
		layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
		samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x ES -x EN -x IS -x IN", layoutflag, "-@")

		outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
		system(paste(headercommand,outfiles,collapse = "; "))

		cpusperchunk <- round(cores/nchunks,0)

		if(inex == FALSE){
		grepcommand <- " \| cut -f12,13,14 \| sed 's/BC:Z://' \| sed 's/GE:Z://' \| sed 's/UB:Z://' \| grep -F -f "
		ex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

		system(paste(ex_cmd,"wait"))
		rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
		if(opt$read_layout == "PE"){
		parms <- ScanBamParam(tag=taglist,
		what="pos",
		flag = scanBamFlag(isFirstMateRead = TRUE),
		tagFilter = list(BC = chunk_bcs),
		which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
		}else{
		grepcommand <- " \| cut -f12,13,14,15 \| sed 's/BC:Z://' \| sed 's/Z://g' \| grep -F -f "
		inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

		system(paste(inex_cmd,"wait"))
		parms <- ScanBamParam(tag=taglist,
		what="pos",
		tagFilter = list(BC = chunk_bcs),
		which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
		}
		system("rm freadHeader")
		system(paste("rm",all_rgfiles))

		return(outfiles)
		dat <- scanBam(file = featfile, param = parms)
		if(inex){
		dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
		}else{
		dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE)
		}

		reads2genes <- function(inex,chunkID, keepUnassigned = FALSE){

		samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",chunkID,".txt")

		if(inex==FALSE){
		reads<-data.table::fread(samfile, na.strings=c(""),
		select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","GE","UB") )[
		,"ftype":="NA"
		][is.na(GE)==F, ftype:="exon"]
		return(dt)
		}, mc.cores = cores)
		rsamtools_reads <- rbindlist(rsamtools_reads, fill = TRUE, use.names = TRUE)
		if(inex){
		rsamtools_reads[ , ftype :="NA"][
		is.na(GEin)==F, ftype :="intron"][
		is.na(GE)==F , ftype:="exon"][
		is.na(GE) , GE:=GEin][
		,GEin:=NULL ]
		}else{
		reads<-data.table::fread(samfile, na.strings=c(""),
		select=c(1,2,3,4),header=T,fill=T,colClasses = "character" , col.names = c("RG","V2","V3","V4") )[
		, V2_id := substr(V2,1,2)
		][ , V3_id := substr(V3,1,2)
		][ , V4_id := substr(V4,1,2)
		][ , V2 := substr(V2,4,nchar(V2))
		][ , V3 := substr(V3,4,nchar(V3))
		][ , V4 := substr(V4,4,nchar(V4))
		][ V2_id == "GE", GE := V2
		][ V2_id == "GI", GEin := V2
		][ V2_id == "UB", UB := V2
		][ V3_id == "GE", GE := V3
		][ V3_id == "GI", GEin := V3
		][ V3_id == "UB", UB := V3
		][ V4_id == "UB", UB := V4
		][ V4_id == "GI", GEin := V4
		][ ,c("V2_id","V3_id","V4_id","V2","V3","V4") := NULL
		][ ,"ftype":="NA"
		][is.na(GEin)==F,ftype:="intron"
		][is.na(GE)==F, ftype:="exon"
		][is.na(GE),GE:=GEin
		][ ,GEin:=NULL ]

		rsamtools_reads[, ftype :="NA"][
		is.na(GE)==F, ftype :="exon"]
		}
		system(paste("rm",samfile))

		setkey(reads,RG)
		setkey(rsamtools_reads,RG)

		if(keepUnassigned){
		return( reads )
		return( rsamtools_reads )
		}else{
		return( reads[GE!="NA"] )
		return( rsamtools_reads[GE!="NA"] )
		}

		}

		hammingFilter<-function(umiseq, edit=1, gbcid=NULL){
		@@ -322,19 +281,19 @@ check_nonUMIcollapse <- function(seqfiles){
		collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
		subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
		umiFUN<-"umiCollapseID"
		lapply(mapList,function(tt){
		parallel::mclapply(mapList,function(tt){
		ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
		bccount=bccount,
		ftype=tt),
		downsampling=lapply( 1:nrow(subsample.splits) , function(i){
		downsampling=parallel::mclapply( 1:nrow(subsample.splits) , function(i){
		umiFUNs[[umiFUN]](reads,bccount,
		nmin=subsample.splits[i,1],
		nmax=subsample.splits[i,2],
		ftype=tt)} )
		ftype=tt)}, mc.cores = floor(opt$num_threads/length(mapList)) )
		)
		names(ll$downsampling)<-subNames
		ll
		})
		}, mc.cores = length(mapList))

		}

		@@ -349,16 +308,18 @@ bindList<-function(alldt,newdt){
		}

		convert2countM<-function(alldt,what){
		fmat<-alldt
		fmat<-copy(alldt)
		for( i in 1:length(alldt)){
		fmat[[i]][[1]]<-.makewide(alldt[[i]][[1]],what)
		for(j in names(alldt[[i]][[2]])){
		fmat[[i]][[2]][[j]]<-.makewide(alldt[[i]][[2]][[j]],what)
		}
		fmat[[i]][[2]] <- fmat[[i]][[2]][sapply(fmat[[i]][[2]], function(x) nrow(x)>0)]
		downsamp_names <- names(fmat[[i]][[2]])
		fmat[[i]][[2]] <- parallel::mclapply(downsamp_names, function(x){
		.makewide(alldt[[i]][[2]][[x]],what)
		}, mc.cores = opt$num_threads)
		names(fmat[[i]][[2]]) <- downsamp_names
		}
		return(fmat)
		}

		write_molecule_mapping <- function(mm){
		mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
		bcs <- unique(mm$BC)

zUMIs-dge2.R

+21 −17

Original line number	Diff line number	Diff line
		@@ -3,6 +3,7 @@ library(methods)
		library(data.table)
		library(yaml)
		library(ggplot2)
		suppressPackageStartupMessages(library(Rsamtools))

		##########################
		myYaml <- commandArgs(trailingOnly = T)
		@@ -126,15 +127,22 @@ if(opt$counting_opts$Ham_Dist == 0){
		dir.create( paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/") )
		}

		samouts <- prep_samtools(featfile = outbamfile,
		bccount = bccount,
		inex = opt$counting_opts$introns,
		cores = opt$num_threads,
		samtoolsexc=samtoolsexc)
		tmpbamfile <- outbamfile
		outbamfile <- paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.sorted.bam")
		print("Coordinate sorting intermediate bam file...")
		sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",outbamfile," ",tmpbamfile)
		system(sort_cmd)
		index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,outbamfile)
		system(index_cmd)
		system(paste0("rm ",tmpbamfile))

		for(i in unique(bccount$chunkID)){
		print( paste( "Hamming distance collapse in barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
		reads<-reads2genes( inex = opt$counting_opts$introns,
		chunkID = i )
		reads <- reads2genes_new(featfile = outbamfile,
		bccount = bccount,
		inex = opt$counting_opts$introns,
		chunk = i,
		cores = opt$num_threads)
		reads <- reads[!UB==""] #make sure only UMI-containing reads go further
		u <- umiCollapseHam(reads,bccount, HamDist=opt$counting_opts$Ham_Dist)
		}
		@@ -144,7 +152,7 @@ if(opt$counting_opts$Ham_Dist == 0){
		outbamfile <- correct_UB_tags(bccount, samtoolsexc)
		sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
		}
		paste("Coordinate sorting final bam file...")
		print("Coordinate sorting final bam file...")
		sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",sortbamfile," ",outbamfile)
		system(sort_cmd)
		index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,sortbamfile)
		@@ -177,18 +185,14 @@ if( opt$counting_opts$introns ){

		########################## assign reads to UB & GENE

		samouts <- prep_samtools(featfile = sortbamfile,
		bccount = bccount,
		inex = opt$counting_opts$introns,
		cores = opt$num_threads,
		samtoolsexc=samtoolsexc)

		for(i in unique(bccount$chunkID)){
		print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
		print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
		reads<-reads2genes( inex = opt$counting_opts$introns,
		chunkID = i )

		reads <- reads2genes_new(featfile = sortbamfile,
		bccount = bccount,
		inex = opt$counting_opts$introns,
		chunk = i,
		cores = opt$num_threads)

		tmp<-collectCounts( reads =reads,
		bccount=bccount[chunkID==i],

zUMIs-mapping.R

+52 −17

Original line number	Diff line number	Diff line
		@@ -10,6 +10,10 @@ additional_fq <- inp$reference$additional_files
		samtools <- inp$samtools_exec
		STAR_exec <- inp$STAR_exec

		if(is.null(inp$mem_limit) \| inp$mem_limit == 0){
		inp$mem_limit <- 100
		}

		# collect filtered bam files ----------------------------------------------
		tmpfolder <- paste(inp$out_dir,"/zUMIs_output/.tmpMerge/",sep="")
		if(inp$which_Stage == "Filtering"){
		@@ -21,6 +25,12 @@ if(inp$which_Stage == "Filtering"){
		}


		# check if multiple STAR instances can be run -----------------------------

		genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"\| cut -f1"), intern = TRUE)
		genome_size <- as.numeric(gsub(pattern = "G",replacement = "", x = genome_size))
		num_star_instances <- floor(inp$mem_limit/genome_size)

		# GTF file setup ----------------------------------------------------------
		#in case of additional sequences, we need to create a custom GTF

		@@ -77,24 +87,18 @@ cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
		# Setup STAR mapping ------------------------------------------------------
		samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
		avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
		#if(avail_cores < 3){
		# cores_samtools <- 1
		# cores_star <- 2
		#}else{
		# cores_samtools <- floor(avail_cores/3)
		# cores_star <- avail_cores - cores_samtools
		#}
		if(inp$which_Stage == "Filtering"){
		avail_cores <- floor(avail_cores / num_star_instances)
		}

		if(avail_cores < 2){
		avail_cores = 1
		}

		#param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype SAM --outStd SAM")
		param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted")
		param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted --quantMode TranscriptomeSAM")
		param_misc <- paste("--genomeDir",inp$reference$STAR_index,
		"--sjdbGTFfile",gtf_to_use,
		"--runThreadN",avail_cores,
		"--readFilesIn",paste0(filtered_bams,collapse=","),
		"--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep=""),
		"--sjdbOverhang", cDNA_read_length-1,
		"--readFilesType SAM",inp$read_layout)

		@@ -103,14 +107,45 @@ if(inp$counting_opts$twoPass==TRUE){
		STAR_command <- paste(STAR_command,"--twopassMode Basic")
		}

		#samtools_output <- paste0(" \| ",samtools," view -@ ",cores_samtools," -o ",inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam")
		#STAR_command <- paste(STAR_command,samtools_output)

		#finally, run STAR
		if(num_star_instances>1 & inp$which_Stage == "Filtering"){
		map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
		dir.create(path = map_tmp_dir)
		input_split <- split(filtered_bams, ceiling(seq_along(filtered_bams) / ceiling(length(filtered_bams) / num_star_instances)))
		input_split <- sapply(input_split, paste0, collapse = ",")
		STAR_preset <- STAR_command
		STAR_command <- lapply(seq(num_star_instances), function(x){
		paste(STAR_preset,
		"--readFilesIn",input_split[x],
		"--outFileNamePrefix",paste0(map_tmp_dir,"/tmp.",inp$project,".",x,"."))
		})
		STAR_command <- paste(unlist(STAR_command), collapse = " & ")
		system(paste(STAR_command,"&",sammerge_command,"& wait"))

		#after parallel instance STAR, collect output data in the usual file places
		out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
		merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
		out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
		merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
		out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
		merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
		system(paste(merge_logs,"&",merge_bams,"&",merge_txbams,"& wait"))
		system(paste0("rm ", map_tmp_dir, "tmp.", inp$project, ".*"))
		}else{
		STAR_command <- paste(STAR_command,
		"--readFilesIn",paste0(filtered_bams,collapse=","),
		"--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep="")
		)
		if(inp$which_Stage == "Filtering"){
		system(paste(STAR_command,"&",sammerge_command,"& wait"))
		system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
		}else{
		system(STAR_command)
		}
		}


		#clean up chunked bam files
		if(inp$which_Stage == "Filtering"){
		system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
		}
		q()

zUMIs-master.sh→zUMIs.sh

+2 −2

Original line number	Diff line number	Diff line
		@@ -3,7 +3,7 @@
		# Pipeline to run UMI-seq analysis from fastq to read count tables.
		# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
		# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
		vers=2.8.3
		vers=2.9.0
		currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/master/zUMIs-master.sh \| grep '^vers=' \| cut -f2 -d "=")
		if [ "$currentv" != "$vers" ] ; then
		echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";
		@@ -75,7 +75,7 @@ check_opts "${yaml}" "YAML" "-y"

		# create temporary YAML file for corrected options
		yaml_orig=${yaml}
		yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).corrected.yaml
		yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).run.yaml
		cp ${yaml_orig} ${yaml}

		#now get some variables from YAML

Admin message