implement new Rsubread and hamming dists (f6364e11) · Commits · github_fork / ZUMIs

README.md

+3 −0

Original line number	Diff line number	Diff line
		@@ -12,6 +12,9 @@ You can glance through zUMIs in [zUMIs poster](https://github.com/sdparekh/zUMIs

		## Releases

		18 Feb 2018: zUMIs.0.0.3 released.
		Switched support to the new Rsubread version and data format. Furthermore to compensate sequencing/PCR errors, zUMIs now features UMI correction using Hamming distance and binning of adjacent cell barcodes.

		You can find the older version of zUMIs in [zUMIs.0.0.1](https://github.com/sdparekh/zUMIs/releases/tag/zUMIs.0.0.1).

		## Installation and Usage

zUMIs-counting.sh

+4 −2

Original line number	Diff line number	Diff line
		@@ -16,6 +16,8 @@ f3="${14}"
		xc2="${15}"
		nreads="${16}"
		Rexc="${17}"
		ham="${18}"
		XCbin="${19}"

		j=`cat $o/$sn.mapjobid.txt \| cut -f4 -d' '`
		u=`cat $o/$sn.unmapjobid.txt \| cut -f4 -d' '`
		@@ -52,9 +54,9 @@ echo "ln -s -f $o/$sn.aligned.sorted.bam $o/$sn.aligned.sorted.bam.ex" >>$o/$sn.

		if [[ "$whichStage" != "summarising" ]] ; then
		if [[ $n =~ $re ]] ; then
		echo "srun --chdir=$o $Rexc $d/zUMIs-dge.R --gtf $g --abam $o/$sn.aligned.sorted.bam --ubam $o/$sn.barcodelist.filtered.sort.sam --barcodenumber $n --out $o --sn $sn --cores $t --strandedness $s --bcstart $xcst --bcend $xcend --umistart $xmst --umiend $xmend --subsamp $subs --nReadsBC $nreads" >>$o/$sn.dge.sh
		echo "srun --chdir=$o $Rexc $d/zUMIs-dge.R --gtf $g --abam $o/$sn.aligned.sorted.bam --ubam $o/$sn.barcodelist.filtered.sort.sam --barcodenumber $n --out $o --sn $sn --cores $t --strandedness $s --bcstart $xcst --bcend $xcend --umistart $xmst --umiend $xmend --subsamp $subs --nReadsBC $nreads --hamming $ham --XCbin $XCbin" >>$o/$sn.dge.sh
		else
		echo "srun --chdir=$o $Rexc $d/zUMIs-dge.R --gtf $g --abam $o/$sn.aligned.sorted.bam --ubam $o/$sn.barcodelist.filtered.sort.sam --barcodefile $n --out $o --sn $sn --cores $t --strandedness $s --bcstart $xcst --bcend $xcend --umistart $xmst --umiend $xmend --subsamp $subs --nReadsBC $nreads" >>$o/$sn.dge.sh
		echo "srun --chdir=$o $Rexc $d/zUMIs-dge.R --gtf $g --abam $o/$sn.aligned.sorted.bam --ubam $o/$sn.barcodelist.filtered.sort.sam --barcodefile $n --out $o --sn $sn --cores $t --strandedness $s --bcstart $xcst --bcend $xcend --umistart $xmst --umiend $xmend --subsamp $subs --nReadsBC $nreads --hamming $ham --XCbin $XCbin" >>$o/$sn.dge.sh
		fi

		if [[ $stats == "yes" ]] ; then

zUMIs-dge.R

+152 −88

Original line number	Diff line number	Diff line
		@@ -34,7 +34,11 @@ option_list = list(
		make_option(c("--subsamp"), type="character", default="0",
		help="Number of reads for downsampling.", metavar="character"),
		make_option(c("--nReadsBC"), type="character", default="100",
		help="Retain cells with atleast N reads.", metavar="character")
		help="Retain cells with atleast N reads.", metavar="character"),
		make_option(c("--hamming"), type="character", default=0,
		help="Hamming distance filter", metavar="integer"),
		make_option(c("--XCbin"), type="character", default=0,
		help="Hamming distance of XC binning", metavar="integer")
		);

		opt_parser = OptionParser(option_list=option_list);
		@@ -42,46 +46,21 @@ opt = parse_args(opt_parser);

		print("I am loading useful packages...")
		print(Sys.time())
		packages <-c("multidplyr","dplyr","tidyr","reshape2","data.table","optparse","parallel","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble","mclust","Matrix")
		packages <-c("multidplyr","dplyr","tidyr","broom","stringdist","reshape2","data.table","optparse","parallel","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble","mclust","Matrix","Rsubread")
		paks<-lapply(packages, function(x) suppressMessages(require(x, character.only = TRUE)))
		rm(paks)

		#Check the version of Rsubread
		if(length(grep("Rsubread",installed.packages()))==1){
		bla<-data.frame(t(installed.packages()[grep("Rsubread",installed.packages()),c("LibPath","Version")]),row.names = NULL)
		}else{
		bla<-data.frame((installed.packages()[grep("Rsubread",installed.packages()),c("LibPath","Version")]),row.names = NULL)
		}


		if(length(grep("Rsubread",installed.packages()))==0){
		print("I did not find Rsubread so I am installing it...")
		install.packages("https://bioarchive.galaxyproject.org/Rsubread_1.26.0.tar.gz", repos = NULL, type = "source")
		bla<-data.frame(t(installed.packages()[grep("Rsubread",installed.packages()),c("LibPath","Version")]),row.names = NULL)
		library("Rsubread", lib.loc=as.character(bla[bla$Version %in% "1.26.0", "LibPath"]))

		}else{

		if(installed.packages()[grep("Rsubread",installed.packages()),"Version"] != "1.26.0"){
		print("I need Rsubread 1.26.0 so I am installing it...")
		install.packages("https://bioarchive.galaxyproject.org/Rsubread_1.26.0.tar.gz", repos = NULL, type = "source")
		if(length(grep("Rsubread",installed.packages()))==1){
		bla<-data.frame(t(installed.packages()[grep("Rsubread",installed.packages()),c("LibPath","Version")]),row.names = NULL)
		}else{
		bla<-data.frame((installed.packages()[grep("Rsubread",installed.packages()),c("LibPath","Version")]),row.names = NULL)
		}
		library("Rsubread", lib.loc=as.character(bla[bla$Version %in% "1.26.0", "LibPath"]))
		BiocInstaller::biocLite("Rsubread",dependencies = TRUE, ask = FALSE)
		}else{
		print("I am loading Rsubread 1.26.0...")
		if(length(grep("Rsubread",installed.packages()))==1){
		bla<-data.frame(t(installed.packages()[grep("Rsubread",installed.packages()),c("LibPath","Version")]),row.names = NULL)
		}else{
		bla<-data.frame((installed.packages()[grep("Rsubread",installed.packages()),c("LibPath","Version")]),row.names = NULL)
		}
		library("Rsubread", lib.loc=as.character(bla[bla$Version %in% "1.26.0", "LibPath"]))
		if(all(as.numeric_version(installed.packages()[grep("Rsubread",installed.packages()),"Version"])<'1.26.1')){
		print("I need newer Rsubread so I am updating it...")
		BiocInstaller::biocUpdatePackages("Rsubread", ask=FALSE)
		}

		}
		suppressMessages(require("Rsubread"))

		ncores=opt$cores
		bcstart=opt$bcstart
		@@ -92,7 +71,9 @@ stra=opt$strandedness
		subsampling=opt$subsamp
		sn=opt$sn
		out=opt$out
		HamDist=opt$hamming
		nReadsBC=opt$nReadsBC
		XCbin=opt$XCbin
		if(is.null(opt$barcodefile)==F){
		if(opt$barcodefile=="NA"){
		barcodes <- NA
		@@ -135,6 +116,8 @@ if(is.null(opt$gtf)==F){
		q()
		}



		setwd(dirname(abamfile))
		#################

		@@ -185,7 +168,6 @@ exon.saf<-data.frame(GeneID= names(gr.gene)[ol.ex],
		Strand = strand(unlist(exon)))

		saf <- list(introns=intron.saf,exons=exon.saf)
		ftype <- c("in","ex","inex")
		safout <- paste(out,"/zUMIs_output/expression/",sn,".annotationsSAF.rds",sep="")

		saveRDS(saf, file=safout)
		@@ -218,26 +200,48 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
		return(rcfilt)
		}

		hammingFilter<-function(umiseq, edit=1){
		library(dplyr) #necessary for pipe to work within multidplyr workers
		# umiseq a vector of umis, one per read
		umiseq <- sort(umiseq)
		uc <- data.frame(us = umiseq) %>% dplyr::count(us) # normal UMI counts

		if(length(uc$us)>1 && length(uc$us)<10000){ #prevent use of > 100Gb RAM
		umi <- stringdist::stringdistmatrix(uc$us,method="hamming",useNames = "strings",nthread=1) %>% #only 1 core for each multidplyr worker
		broom::tidy() %>%
		dplyr::filter( distance <= edit ) %>% # only remove below chosen dist
		dplyr::left_join(uc, by = c( "item1" = "us")) %>%
		dplyr::left_join(uc, by = c( "item2" = "us"), suffix =c(".1",".2")) %>%
		dplyr::transmute( rem=if_else( n.1>=n.2, item2, item1 )) %>% #discard the UMI with fewer reads
		unique()
		if(nrow(umi)>0){
		uc <- uc[-match(umi$rem,uc$us),] #discard all filtered UMIs
		}
		}
		n <- nrow(uc)
		return(n)
		}

		if(ftype == "inex"){

		fctsfilein <- data.table::fread(paste("cut -f2,3 ",abamfile[1],".featureCounts",sep=""), sep="\t",quote='',header = F) #in
		fctsfileex <- data.table::fread(paste("cut -f2,3 ",abamfile[2],".featureCounts",sep=""), sep="\t",quote='',header = F) #ex
		fctsfilein <- data.table::fread(paste("cut -f2,4 ",abamfile[1],".featureCounts",sep=""), sep="\t",quote='',header = F) #in
		fctsfileex <- data.table::fread(paste("cut -f2,4 ",abamfile[2],".featureCounts",sep=""), sep="\t",quote='',header = F) #ex
		reads <- data.table::fread(paste("cut -f10 ",ubamfile,sep=""), quote='',header = F,skip=1)
		reads <- tibble::tibble(XC=substring(reads$V1, bcstart, bcend),XM=substring(reads$V1, umistart, umiend),GE=fctsfileex$V2,assignment=fctsfileex$V1,ftype="exon")
		fctsfilein$ftype<-"intron"
		reads[which(reads$GE==""),c("GE","assignment","ftype")] <- fctsfilein[which(reads$GE==""),c("V2","V1","ftype")]
		reads[which(is.na(reads$GE)),c("GE","assignment","ftype")] <- fctsfilein[which(is.na(reads$GE)),c("V2","V1","ftype")]
		reads$ftype <- ifelse(reads$assignment=="Assigned",reads$ftype,"inex")
		saveRDS(object = reads,file = paste(out,"/zUMIs_output/expression/",sn,".tbl.rds",sep=""))
		} else{

		fcts <- Rsubread::featureCounts(files=abamfile[1],annot.ext=safannot[[1]],isGTFAnnotationFile=F,primaryOnly=T,nthreads=1,reportReads=T,strandSpecific=stra)# do not use more than nthreads=1!
		fcts <- Rsubread::featureCounts(files=abamfile[2],annot.ext=safannot[[2]],isGTFAnnotationFile=F,primaryOnly=T,nthreads=1,reportReads=T,strandSpecific=stra)# do not use more than nthreads=1!
		fcts <- Rsubread::featureCounts(files=abamfile[1],annot.ext=safannot[[1]],isGTFAnnotationFile=F,primaryOnly=T,nthreads=1,reportReads="CORE",strandSpecific=stra)# do not use more than nthreads=1!
		fcts <- Rsubread::featureCounts(files=abamfile[2],annot.ext=safannot[[2]],isGTFAnnotationFile=F,primaryOnly=T,nthreads=1,reportReads="CORE",strandSpecific=stra)# do not use more than nthreads=1!

		fctsfile <- data.table::fread(paste("cut -f3 ",abamfile[2],".featureCounts",sep=""), sep="\t",quote='',header = F)
		fctsfile <- data.table::fread(paste("cut -f4 ",abamfile[2],".featureCounts",sep=""), sep="\t",quote='',header = F)
		reads <- data.table::fread(paste("cut -f10 ",ubamfile,sep=""), quote='',header = F,skip=1)

		reads <- tibble::tibble(XC=substring(reads$V1, bcstart, bcend),XM=substring(reads$V1, umistart, umiend),GE=fctsfile$V1)
		}


		if(is.na(bcfile)){
		fullstats <- reads %>% dplyr::group_by(XC) %>% dplyr::summarise(nreads=length(XM))
		@@ -249,11 +253,11 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,

		if(nrow(fullstats_detected)<10){
		print("Attention! Adaptive BC selection gave < 10 cells so I will now use top 100 cells!")
		bc <- reads %>% dplyr::group_by(XC) %>% dplyr::summarise(n=length(XM)) %>% dplyr::top_n(100) %>% dplyr::select(V1=XC)
		bc <<- reads %>% dplyr::group_by(XC) %>% dplyr::summarise(n=length(XM)) %>% dplyr::top_n(100) %>% dplyr::select(V1=XC)
		fullstats_detected<- fullstats[which(fullstats$XC %in% bc$V1),]
		}else{
		print(paste(nrow(fullstats_detected)," barcodes detected.",sep=""))
		bc <- data.frame(V1=fullstats_detected$XC)
		bc <<- data.frame(V1=fullstats_detected$XC)
		}

		#selected cells
		@@ -271,17 +275,55 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
		}else{
		if(is.numeric(bcfile)){
		fullstats_detected <- reads %>% dplyr::group_by(XC) %>% dplyr::summarise(nreads=length(XM)) %>% dplyr::top_n(bcfile)
		bc <- data.frame(V1=fullstats_detected$XC)
		bc <<- data.frame(V1=fullstats_detected$XC)
		}else{
		bc <- read.table(bcfile,header = F,stringsAsFactors = F)
		bc <<- read.table(bcfile,header = F,stringsAsFactors = F)
		}
		}
		}
		## XC binning below
		if(XCbin != 0){
		print(paste("I am binning cell barcodes within hamming distance ",XCbin,sep=""))
		binmat <- stringdist::stringdistmatrix(bc$V1,unique(reads$XC),method="hamming",useNames = "strings",nthread=ncores)
		tmp <- reshape2::melt(binmat) %>% dplyr::mutate_if(is.factor, as.character) %>% dplyr::filter(value>0 & value <=XCbin)
		if(XCbin > 1){ #if there are conflicts we can choose the closer ones for dists >1
		dups <- tmp$Var2[duplicated(tmp$Var2)]
		for(i in dups){
		tmpdists<-tmp[which(tmp$Var2==i),"value"]
		if(min(tmpdists)==max(tmpdists)){ #if the dups are all with same dist
		tmp <- tmp[-which(tmp$Var2==i),] #..remove them
		}else{
		tmp <- tmp[-which(tmp$Var2==i & tmp$value>min(tmpdists)),] #keep only the minimal distance
		if(nrow(bla[which(tmp$Var2==i),])>1){ #if still more than one possibility
		tmp <- tmp[-which(tmp$Var2==i),] #...remove them
		}
		}
		}
		}else{
		dups <- tmp$Var2[duplicated(tmp$Var2)]
		binnable <- tmp$Var2[!(tmp$Var2 %in% dups)] #avoid conflicts
		}
		print(paste(length(binnable)," adjacent barcodes will be binned",sep=""))
		for(i in 1:nrow(binmat)){
		tobin<-names(which(binmat[i,]>0 & binmat[i,]<=XCbin))
		tobin<-tobin[which(tobin %in% binnable)]
		if(length(tobin)>0){
		reads[which(reads$XC %in% tobin),"XC"] <- row.names(binmat)[i]
		}
		}
		saveRDS(object = reads,file = paste(out,"/zUMIs_output/expression/",sn,".XCbinned.tbl.rds",sep=""))
		}
		## end XC binning

		#cluster <- create_cluster(ncores) # The clustering seems to have issue in partition when there is not enough data to spread on all the cores.
		#set_default_cluster(cluster)

		#umicounts <- reads %>% dplyr::filter((XC %in% bc$V1) & (GE!="*")) %>% partition(XC, cluster = cluster) %>% group_by(XC,GE) %>% summarise(umicount=length(unique(XM)),readcount=length(XM)) %>% collect()
		umicounts <- reads %>% dplyr::filter((XC %in% bc$V1) & (GE!="*")) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		if(HamDist==0){
		umicounts <- reads %>% dplyr::filter((XC %in% bc$V1) & (!is.na(GE))) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		}else{
		cluster <- create_cluster(ncores) # The clustering seems to have issue in partition when there is not enough data to spread on all the cores.
		set_default_cluster(cluster)
		cluster_copy(cluster, hammingFilter)
		cluster_copy(cluster, HamDist)
		umicounts <- reads %>% dplyr::filter((XC %in% bc$V1) & (!is.na(GE))) %>% multidplyr::partition(XC, cluster = cluster) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=hammingFilter(XM,edit = HamDist),readcount=length(XM)) %>% dplyr::collect()
		}

		if(subsampling!="0") {
		if(grepl(pattern = ",",x = subsampling)==TRUE){
		@@ -301,8 +343,17 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,

		if(as.logical((nrow(reads %>% dplyr::group_by(XC) %>% dplyr::summarise(n=length(XM)) %>% dplyr::filter(n>=subsampling_min))) >= 2)==TRUE){
		print(paste("I am subsampling to ",subsampling_iter,sep=""))
		tmp1 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) > subsampling_max) %>% dplyr::sample_n(size = subsampling_max,replace=F)%>% dplyr::filter(GE!="*") %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		tmp2 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter((length(XC) < subsampling_max) & (length(XC) >= subsampling_min))%>% dplyr::filter(GE!="*") %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		if(HamDist==0){
		tmp1 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) > subsampling_max) %>% dplyr::sample_n(size = subsampling_max,replace=F)%>% dplyr::filter(!is.na(GE)) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))

		tmp2 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter((length(XC) < subsampling_max) & (length(XC) >= subsampling_min))%>% dplyr::filter(!is.na(GE)) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		}else{
		tmp1 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) > subsampling_max) %>% dplyr::sample_n(size = subsampling_max,replace=F)%>% dplyr::filter(!is.na(GE))
		if(nrow(tmp1)>0){
		tmp1 %>% multidplyr::partition(XC, cluster = cluster) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=hammingFilter(XM,edit = HamDist),readcount=length(XM)) %>% dplyr::collect()
		}
		tmp2 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter((length(XC) < subsampling_max) & (length(XC) >= subsampling_min))%>% dplyr::filter(!is.na(GE)) %>% multidplyr::partition(XC, cluster = cluster) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=hammingFilter(XM,edit = HamDist),readcount=length(XM)) %>% dplyr::collect()
		}
		umicounts_sub <- dplyr::bind_rows(tmp1,tmp2)
		}else{
		print("Error! None of the barcodes has more than the requested minimal number of reads")
		@@ -311,8 +362,11 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
		subsampling_no <- as.numeric(subsampling_iter)
		if(as.logical((nrow(reads %>% dplyr::group_by(XC) %>% dplyr::summarise(n=length(XM)) %>% dplyr::filter(n>=subsampling_no))) >= 2)==TRUE){
		print(paste("I am subsampling to ",subsampling_iter,sep=""))
		umicounts_sub <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) >= subsampling_no) %>% dplyr::sample_n(size = subsampling_no,replace=F)%>% dplyr::filter(GE!="*") %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))

		if(HamDist==0){
		umicounts_sub <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) >= subsampling_no) %>% dplyr::sample_n(size = subsampling_no,replace=F)%>% dplyr::filter(!is.na(GE)) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		}else{
		umicounts_sub <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) >= subsampling_no) %>% dplyr::sample_n(size = subsampling_no,replace=F)%>% dplyr::filter(!is.na(GE)) %>% multidplyr::partition(XC, cluster = cluster) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=hammingFilter(XM,edit = HamDist),readcount=length(XM)) %>% dplyr::collect()
		}
		}else{
		print("Error! None of the barcodes has more than the requested number of reads")
		}
		@@ -349,8 +403,16 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
		MAD_low <- round(MAD_low,digits = 0)

		print(paste("I am subsampling between ",MAD_low," and ",MAD_up," reads per barcode.",sep=""))
		tmp1 <- reads %>% dplyr::filter(XC %in% bcs_detected) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) > MAD_up) %>% dplyr::sample_n(size = MAD_up,replace=F) %>% dplyr::filter(GE!="*") %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		tmp2 <- reads %>% dplyr::filter(XC %in% bcs_detected) %>% dplyr::group_by(XC) %>% dplyr::filter((length(XC) >= MAD_low )& (length(XC) <= MAD_up)) %>% dplyr::filter(GE!="*") %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		if(HamDist==0){
		tmp1 <- reads %>% dplyr::filter(XC %in% bcs_detected) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) > MAD_up) %>% dplyr::sample_n(size = MAD_up,replace=F) %>% dplyr::filter(!is.na(GE)) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		tmp2 <- reads %>% dplyr::filter(XC %in% bcs_detected) %>% dplyr::group_by(XC) %>% dplyr::filter((length(XC) >= MAD_low )& (length(XC) <= MAD_up)) %>% dplyr::filter(!is.na(GE)) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
		}else{
		tmp1 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% dplyr::group_by(XC) %>% dplyr::filter(length(XC) > MAD_up) %>% dplyr::sample_n(size = MAD_up,replace=F)%>% dplyr::filter(!is.na(GE))
		if(nrow(tmp1)>0){
		tmp1 %>% multidplyr::partition(XC, cluster = cluster) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=hammingFilter(XM,edit = HamDist),readcount=length(XM)) %>% dplyr::collect()
		}
		tmp2 <- reads %>% dplyr::filter(XC %in% bcs_detected) %>% dplyr::group_by(XC) %>% dplyr::filter((length(XC) >= MAD_low )& (length(XC) <= MAD_up)) %>% dplyr::filter(!is.na(GE)) %>% multidplyr::partition(XC, cluster = cluster) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=hammingFilter(XM,edit = HamDist),readcount=length(XM)) %>% dplyr::collect()
		}
		umicounts_sub <- dplyr::bind_rows(tmp1,tmp2)

		downsampling_list <-list()
		@@ -384,6 +446,8 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
		return(l)
		}

		ftype <- c("in","ex","inex")

		bams <- c(paste(abamfile,"in",sep="."),paste(abamfile,"ex",sep="."))

		AllCounts <-list()

zUMIs-mapping.sh

+1 −1

Original line number	Diff line number	Diff line
		@@ -33,6 +33,6 @@ echo '#SBATCH --dependency=afterok:'$j >>$o/$sn.map.sh
		echo '#SBATCH --mem='$m >>$o/$sn.map.sh

		echo "srun $starexc --genomeDir $g --runThreadN $t --readFilesCommand zcat --sjdbGTFfile $gtf --outFileNamePrefix $o/$sn. --outSAMtype BAM Unsorted --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --sjdbOverhang $rl --twopassMode Basic --readFilesIn $o/$sn.cdnaread.filtered.fastq.gz $x" >>$o/$sn.map.sh
		echo "srun $samtoolsexc sort -n -O bam -T temp.$sn -@ $t -m 2G -o $o/$sn.aligned.sorted.bam $o/$sn.Aligned.out.bam" >>$o/$sn.map.sh
		echo "srun $samtoolsexc sort -n -O bam -T $o/temp.$sn -@ $t -m 2G -o $o/$sn.aligned.sorted.bam $o/$sn.Aligned.out.bam" >>$o/$sn.map.sh

		sbatch $o/$sn.map.sh > $o/$sn.mapjobid.txt

zUMIs-master.sh

+17 −9

File changed.

Preview size limit exceeded, changes collapsed.

Admin message