zUMIs2.8.0 update (80b041ad) · Commits · github_fork / ZUMIs

README.md

+2 −0

Original line number	Diff line number	Diff line
		@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
		zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

		## Changelog
		02 May 2020: zUMIs2.8.0: Ensure correct behaviour when resuming from mapping (`which_stage: Mapping`). It is now possible to fractionally count reads overlapping multiple features (on the same strand if strand-specificity is used) `multi_overlap: yes`. New function for parsing the gene annotations: Fixed a problem with genes completely within other another gene's intron & added the possibility to extend genomic interval of exons (this is useful for crossmapping species with poor annotations). Added a probability calculation to compare the occurance of intronic reads to an expectation generated from background intergenic mapping reads (`intronProb: yes`). Added statistics output for the total number of reads observed per gene.

		22 Apr 2020: zUMIs2.7.3: zUMIs will try to parse a geneID to gene name mapping file from the user provided GTF annotations.

		27 Mar 2020: zUMIs2.7.2: New barcode handling functionalities: When using intersection of automatic BC detection and BC whitelist and the full barcode is composed out of several barcode pieces (eg. RT barcode + illumina barcode), the whitelist can now also just be corresponding to just one of the barcode pieces (eg. RT barcode only whitelist). Furthermore, some scRNA-seq protocols may have several cell barcodes that belong to the same cell (eg. SPLiT-seq with oligo-dT/random-hex round 1 barcode; i7 barcode mix in 10x Genomics). zUMIs now supports internally combing the counts via the `barcode_sharing:` option. Please look at the [wiki for further details](https://github.com/sdparekh/zUMIs/wiki/Barcodes#barcode-sharing-feature) and at [examples for some protocols](https://github.com/sdparekh/zUMIs/wiki/Protocol-specific-setup).

UMIstuffFUN.R

+128 −15

Original line number	Diff line number	Diff line
		@@ -102,7 +102,7 @@ prep_samtools <- function(featfile,bccount,inex,cores,samtoolsexc){
		return(outfiles)
		}

		reads2genes <- function(inex,chunkID){
		reads2genes <- function(inex,chunkID, keepUnassigned = FALSE){

		samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",chunkID,".txt")

		@@ -140,9 +140,14 @@ reads2genes <- function(inex,chunkID){

		setkey(reads,RG)

		if(keepUnassigned){
		return( reads )
		}else{
		return( reads[GE!="NA"] )
		}

		}

		hammingFilter<-function(umiseq, edit=1, gbcid=NULL){
		# umiseq a vector of umis, one per read
		uc <- data.table(us = umiseq)[, .N, by = "us"] # normal UMI counts
		@@ -241,11 +246,16 @@ ham_helper_fun <- function(x){
		}

		.makewide <- function(longdf,type){
		ge<-as.factor(longdf$GE)
		xc<-as.factor(longdf$RG)
		dt<-longdf[, list(GEu=unlist(strsplit(GE,","))), by=c("RG","GE",type)][ #this splits multioverlap gene lists by comma
		, cnt := get(type) / (stringr::str_count(GE,",")+1) ][
		, list(tot=sum(cnt)),by=c("RG","GEu") ]

		ge<-as.factor(dt$GEu)
		xc<-as.factor(dt$RG)

		widedf <- Matrix::sparseMatrix(i=as.integer(ge),
		j=as.integer(xc),
		x=as.numeric(unlist(longdf[,type,with=F])),
		x=as.numeric(unlist(dt[,"tot",with=F])),
		dimnames=list(levels(ge), levels(xc)))
		return(widedf)
		}
		@@ -388,7 +398,7 @@ demultiplex_bam <- function(opt, bamfile, nBCs){
		dir.create( paste0(opt$out_dir,"/zUMIs_output/demultiplexed/") )
		}

		installed_py <- system("pip freeze", intern = TRUE)
		installed_py <- try(system("pip freeze", intern = TRUE))

		if(any(grepl("pysam==",installed_py))){
		print("Using python implementation to demultiplex.")
		@@ -515,6 +525,26 @@ fixMissingOptions <- function(config){
		config$counting_opts$downsampling <- "0"
		}

		if(is.null(config$reference$exon_extension)){
		config$reference$exon_extension <- FALSE
		}

		if(is.null(config$reference$extension_length)){
		config$reference$extension_length <- 0
		}

		if(is.null(config$reference$scaffold_length_min)){
		config$reference$scaffold_length_min <- 0
		}

		if(is.null(config$counting_opts$multi_overlap)){
		config$counting_opts$multi_overlap <- FALSE
		}

		if(is.null(config$counting_opts$intronProb)){
		config$counting_opts$intronProb <- FALSE
		}

		return(config)
		}

		@@ -527,3 +557,86 @@ RPKM.calc <- function(exprmat, gene.length){
		y <- t(t(x)/lib.size)
		y/gene.length.kb
		}


		.intronProbability<-function(featfile,bccount,inex,cores,samtoolsexc,saf, allC){
		print("Fetching reads from bam files again to calculate intron scores...")

		nchunks <- length(unique(bccount$chunkID))
		all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")

		for(i in unique(bccount$chunkID)){
		rgfile <- all_rgfiles[i]
		chunks <- bccount[chunkID==i]$XC
		write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
		}

		headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
		write(headerXX,"freadHeader")

		headercommand <- "cat freadHeader > "
		layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
		samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x UB -x EN -x IN -x GE -x GI", layoutflag, "-@")

		outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
		system(paste(headercommand,outfiles,collapse = "; "))

		cpusperchunk <- round(cores/nchunks,0)

		grepcommand <- " \| cut -f12,13,14 \| sed 's/BC:Z://' \| sed 's/ES:Z://g' \| sed 's/IS:Z://g' \| grep -F -f "
		inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

		system(paste(inex_cmd,"wait"))
		system("rm freadHeader")
		system(paste("rm",all_rgfiles))

		#continue with reading the data in
		for(i in unique(bccount$chunkID)){
		print(paste("Working on barcode chunk", i, "out of", length(unique(bccount$chunkID))))
		print(paste("Processing", length(bccount[chunkID == i]$XC), "barcodes in this chunk..."))
		samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",i,".txt")

		reads<-data.table::fread(samfile, na.strings=c(""),
		select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","ES","IS") )
		reads <- reads[ES == "Unassigned_NoFeatures" & IS == "Unassigned_NoFeatures"]
		system(paste("rm",samfile))

		scores_out<-.calculateProbabilityScores(reads = reads,
		saf = saf,
		bccount = bccount[chunkID == i],
		allC = allC)
		if(i == 1){
		scores <- scores_out
		}else{
		scores <- rbind(scores, scores_out)
		}
		}
		return (scores)
		}


		.calculateProbabilityScores<-function(reads,saf,bccount,allC){
		#count number of intergenic reads per BC; use only intronic mapped reads
		dt <-reads[,.(intergenicPerBC = .N), by = RG]

		#get count values
		tmp <- merge(x= allC$exon$all, y = allC$intron$all, by=c("GE","RG"),suffixes=c(".ex",".in"),all=T)
		tmp <- tmp[,c("GE","RG","readcount.ex","readcount.in")]
		tmp[is.na(tmp)]<-0
		#tmp[readcount.in ==0, readcount.in := 1] #??

		#get intronic bp per gene
		tmp<-merge(x=tmp,y=saf$intronsPerGene, by.x="GE", by.y="GeneID")


		dt<-merge(x=dt,y=tmp, by="RG", all.x=T)


		#lambda = (total number of intergenic reads in genome (per BC)/all intergenic bp in genome)*intronic bp per gene
		dt<-dt[, lambda:= (intergenicPerBC/saf$intergenicBp) * IntronLengthPerGene,][
		, prob:=ptpois(q=readcount.in ,lambda=lambda, lower.tail = F ) , by= c("RG","GE")]

		setorder(dt, GE, RG)

		return (dt[,c("GE","RG","prob"), with = FALSE])
		}

runfeatureCountFUN.R

+328 −49

File changed.

Preview size limit exceeded, changes collapsed.

statsFUN.R

+13 −0

Original line number	Diff line number	Diff line
		@@ -248,3 +248,16 @@ totReadBoxplot<-function(typeCount,fillcol){
		axis.title.x=element_blank())
		return(box)
		}

		sumGene <- function(counts){
		outlist <- lapply(names(counts$readcount), function(x){
		res <- Matrix::rowSums(counts$readcount[[x]]$all)
		out <- data.table(
		GeneID = names(res),
		nReads = as.numeric(res),
		ftype = x
		)
		})
		outdt <- rbindlist(outlist)
		return(outdt)
		}
		No newline at end of file

zUMIs-dge2.R

+42 −16

Original line number	Diff line number	Diff line
		@@ -50,28 +50,33 @@ bccount<-splitRG(bccount=bccount, mem= opt$mem_limit)
		##############################################################
		##### featureCounts

		abamfile<-paste0(opt$out_dir,"/",opt$project,".filtered.tagged.Aligned.out.bam")
		outbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.bam")

		## gene annotation
		saf<-.makeSAF(paste0(opt$out_dir,"/",opt$project,".final_annot.gtf"))
		saf<-.makeSAF(gtf = paste0(opt$out_dir,"/",opt$project,".final_annot.gtf"),
		extension_var = opt$reference$exon_extension,
		exon_extension = opt$reference$extension_length,
		buffer_length = (opt$reference$extension_length / 2),
		scaff_length = opt$reference$scaffold_length_min,
		multi_overlap_var = opt$counting_opts$multi_overlap)
		try(gene_name_mapping <- .get_gene_names(gtf = paste0(opt$out_dir,"/",opt$project,".final_annot.gtf"), threads = opt$num_threads), silent = TRUE)
		try(data.table::fwrite(gene_name_mapping, file = paste0(opt$out_dir,"/zUMIs_output/expression/",opt$project,".gene_names.txt"), sep ="\t", quote = FALSE), silent = TRUE)
		##

		abamfile<-paste0(opt$out_dir,"/",opt$project,".filtered.tagged.Aligned.out.bam")
		outbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.bam")

		if(smart3_flag & opt$counting_opts$strand == 1){
		#split bam in UMU ends and internal
		tmp_bams <- split_bam(bam = abamfile, cpu = opt$num_threads, samtoolsexc=samtoolsexc)

		#assign features with appropriate strand
		fnex_int<-.runFeatureCount(tmp_bams[1], saf=saf$exons, strand=0, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
		fnex_umi<-.runFeatureCount(tmp_bams[2], saf=saf$exons, strand=1, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
		fnex_int<-.runFeatureCount(tmp_bams[1], saf=saf$exons, strand=0, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
		fnex_umi<-.runFeatureCount(tmp_bams[2], saf=saf$exons, strand=1, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
		ffiles_int <- paste0(fnex_int,".tmp")
		ffiles_umi <- paste0(fnex_umi,".tmp")

		if(opt$counting_opts$introns){
		fnin_int<-.runFeatureCount(ffiles_int, saf=saf$introns, strand=0, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
		fnin_umi<-.runFeatureCount(ffiles_umi, saf=saf$introns, strand=1, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
		fnin_int<-.runFeatureCount(ffiles_int, saf=saf$introns, strand=0, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
		fnin_umi<-.runFeatureCount(ffiles_umi, saf=saf$introns, strand=1, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
		ffiles_int <- paste0(fnin_int,".tmp")
		ffiles_umi <- paste0(fnin_umi,".tmp")
		}
		@@ -86,7 +91,8 @@ if(smart3_flag & opt$counting_opts$strand == 1){
		primaryOnly = opt$counting_opts$primaryHit,
		cpu = opt$num_threads,
		mem = opt$mem_limit,
		fcounts_clib = fcounts_clib)
		fcounts_clib = fcounts_clib,
		multi_overlap_var = opt$counting_opts$multi_overlap)
		ffiles<-paste0(fnex,".tmp")

		if(opt$counting_opts$introns){
		@@ -97,7 +103,8 @@ if(smart3_flag & opt$counting_opts$strand == 1){
		primaryOnly = opt$counting_opts$primaryHit,
		cpu = opt$num_threads,
		mem = opt$mem_limit,
		fcounts_clib = fcounts_clib)
		fcounts_clib = fcounts_clib,
		multi_overlap_var = opt$counting_opts$multi_overlap)
		system(paste0("rm ",fnex,".tmp"))
		ffiles<-paste0(fnin,".tmp")
		}
		@@ -223,9 +230,28 @@ if(UMIcheck == "nonUMI" \| smart3_flag == TRUE ){

		saveRDS(final,file=paste(opt$out_dir,"/zUMIs_output/expression/",opt$project,".dgecounts.rds",sep=""))

		#################
		#Accessory processing scripts
		################# #Accessory processing scripts
		#Intron Probability Score Calculation
		if(opt$counting_opts$intronProb == TRUE){
		if(opt$counting_opts$introns == FALSE){
		print("Intron information is needed to calculate Intron-Probability! Please change yaml settings counting_opts->introns to yes.\n Skipping probability calculation...")
		}else{
		library(extraDistr)
		print("Starting Intron-Probability Calculations...")

		# Extractingreads from sam files again, this could be sped up by integrating code further upstream of zUMIs-dge2
		genesWithIntronProb<-.intronProbability(bccount=bccount,
		featfile=sortbamfile,
		inex=opt$counting_opts$introns,
		cores=opt$num_threads,
		samtoolsexc=samtoolsexc,
		allC=allC,
		saf=saf)
		saveRDS(genesWithIntronProb, file = paste0(opt$out_dir,"/zUMIs_output/stats/",opt$project,".intronProbability.rds"))
		}
		}

		#demultiplexing
		if(opt$counting_opts$Ham_Dist == 0 && opt$barcodes$demultiplex == TRUE ){ #otherwise its already demultiplexed!
		print("Demultiplexing output bam file by cell barcode...")
		demultiplex_bam(opt, sortbamfile, nBCs = length(unique(bccount$XC)))

Admin message