removed extra file (b9ee5206) · Commits · github_fork / ZUMIs

zUMIs-dge_newdownsamplinglists.R

deleted100755 → 0

+0 −403

Original line number	Diff line number	Diff line
		#!/usr/bin/env Rscript

		print("I am loading useful packages...")
		print(Sys.time())
		packages <-c("multidplyr","dplyr","tidyr","reshape2","data.table","optparse","parallel","Rsubread","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble")
		paks<-lapply(packages, function(x) suppressMessages(require(x, character.only = TRUE)))
		rm(paks)

		# optparse ---------------------------------------------------------------

		option_list = list(
		make_option(c("--gtf"), type="character", default=NULL,
		help="GTF file", metavar="character"),
		make_option(c("--abam"), type="character", default=NULL,
		help="STAR Aligned bam file", metavar="character"),
		make_option(c("--ubam"), type="character", default=NULL,
		help="XC/XM read unaligned sam file", metavar="character"),
		make_option(c("--barcodefile"), type="character", default=NULL,
		help="A file with a list of cell barcodes without a header", metavar="character"),
		make_option(c("--barcodenumber"), type="integer", default=NULL,
		help="number of highest barcodes to take", metavar="integer"),
		make_option(c("--out"), type="character", default=NULL,
		help="output directory", metavar="character"),
		make_option(c("--sn"), type="character", default="study",
		help="Study name", metavar="character"),
		make_option(c("--cores"), type="integer", default=10,
		help="Number of threads", metavar="integer"),
		make_option(c("--strandedness"), type="logical", default=TRUE,
		help="Is your RNA-seq library stranded?", metavar="logical"),
		make_option(c("--bcstart"), type="integer", default=1,
		help="Start position of cell barcode in the read", metavar="integer"),
		make_option(c("--bcend"), type="integer", default=6,
		help="End position of cell barcode in the read", metavar="integer"),
		make_option(c("--umistart"), type="integer", default=7,
		help="Start position of UMI in the read", metavar="integer"),
		make_option(c("--umiend"), type="integer", default=16,
		help="End position of UMI barcode in the read", metavar="integer"),
		make_option(c("--subsamp"), type="character", default="0",
		help="Number of reads for downsampling.", metavar="character")
		);

		opt_parser = OptionParser(option_list=option_list);
		opt = parse_args(opt_parser);

		ncores=opt$cores
		bcstart=opt$bcstart
		bcend=opt$bcend
		umistart=opt$umistart
		umiend=opt$umiend
		stra=opt$strandedness
		subsampling=opt$subsamp
		sn=opt$sn
		out=opt$out

		if(is.null(opt$barcodefile)==F){
		if(opt$barcodefile=="NA"){
		barcodes <- NA
		}else if(file.exists(opt$barcodefile)){
		barcodes<-opt$barcodefile
		}else{
		print("The barcodes file does not exist!")
		q()
		}
		}else if( is.null(opt$barcodenumber)==F){
		barcodes<-opt$barcodenumber
		}else if(is.null(opt$barcodenumber)==F & is.null(opt$barcodefile)==F){
		barcodes<-opt$barcodefile
		}else if(is.null(opt$barcodenumber)==T & is.null(opt$barcodefile)==T){
		print("Provide either barcodes file or number of barcodes to choose.")
		q()
		}else{
		print("Provide either barcodes file or number of barcodes to choose.")
		q()
		}

		if(is.null(opt$abam)==F){
		abamfile <- opt$abam
		}else{
		print("Provide an aligned bam file with unmapped reads within.")
		q()
		}

		if(is.null(opt$ubam)==F){
		ubamfile <- opt$ubam
		}else{
		print("Provide an unalinged SAM file of barcode reads.")
		q()
		}

		if(is.null(opt$gtf)==F){
		gtf <- opt$gtf
		}else{
		print("Provide a GTF file please.")
		q()
		}

		setwd(dirname(abamfile))
		#################

		# make SAF of intron/exon/intron&exon -------------------------------------

		print("I am making annotations in SAF... This will take less than 3 minutes...")
		print(Sys.time())

		txdb <- makeTxDbFromGFF(file=gtf, format="gtf")

		## Make Gene-range GR-object
		se <- AnnotationDbi::select(txdb, keys(txdb, "GENEID"),
		columns=c("GENEID","TXCHROM","TXSTART","TXEND","TXSTRAND"),
		keytype="GENEID") %>%
		dplyr::group_by(GENEID,TXCHROM,TXSTRAND) %>%
		dplyr::mutate( txstart =ifelse(TXSTART<TXEND,min(TXSTART),min(TXEND)),
		txend =ifelse(TXSTART<TXEND,max(TXEND),min(TXSTART) ) ) %>%
		dplyr::select(GENEID,TXCHROM,TXSTRAND,txstart,txend) %>% unique()


		gr.gene<-GRanges(seqnames = se$TXCHROM,
		ranges = IRanges(start= se$txstart,
		end= se$txend,
		names=se$GENEID),
		strand = se$TXSTRAND,
		gid = se$GENEID)

		### Get non-overlapping Introns/Exons
		intron<-intronsByTranscript(txdb, use.names=T)
		exon<-exonsBy(txdb, by="tx",use.names=T)

		intron.exon.red <- c( GenomicRanges::reduce(unlist(intron),ignore.strand=T), GenomicRanges::reduce(unlist(exon),ignore.strand=T) )
		intron.exon.dis <- GenomicRanges::disjoin(intron.exon.red, ignore.strand=T)
		intron.only<-GenomicRanges::setdiff(intron.exon.dis, unlist(exon) ,ignore.strand=T)

		ol.in<-GenomicRanges::findOverlaps(intron.only, gr.gene, select="arbitrary")
		ol.ex<-GenomicRanges::findOverlaps(unlist(exon), gr.gene, select="arbitrary")

		intron.saf<-data.frame(GeneID= names(gr.gene)[ol.in],
		Chr = seqnames(intron.only),
		Start = start(intron.only),
		End = end(intron.only),
		Strand = strand(intron.only))
		exon.saf<-data.frame(GeneID= names(gr.gene)[ol.ex],
		Chr = seqnames(unlist(exon)),
		Start = start(unlist(exon)),
		End = end(unlist(exon)),
		Strand = strand(unlist(exon)))

		saf <- list(introns=intron.saf,exons=exon.saf)
		ftype <- c("in","ex","inex")
		safout <- paste(out,"/zUMIs_output/expression/",sn,".annotationsSAF.rds",sep="")

		saveRDS(saf, file=safout)
		rm(se,gr.gene,intron,exon,intron.exon.red,intron.exon.dis,intron.only,ol.ex,ol.in,intron.saf,exon.saf)
		#################

		print("I am making count tables...This will take a while!!")
		print(Sys.time())


		# make umi and read count tables ------------------------------------------
		makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,bcend,umistart,umiend,subsampling,ftype,sn,out){
		makewide <- function(longdf,type){
		widedf <- longdf %>% dplyr::select_("XC","GE",type) %>% tidyr::spread_(.,key = "XC",value = type,fill = 0,drop = T)
		widedf <- as.data.frame(widedf)
		row.names(widedf) <- widedf$GE
		widedf <- widedf[,-1]
		return(widedf)
		}

		if(ftype=="in"){
		abamfile <- abamfile[1]
		}
		if(ftype=="ex"){
		abamfile <- abamfile[2]
		}

		packages <-c("multidplyr","dplyr","tidyr","reshape2","data.table","Rsubread","methods")
		bla<-lapply(packages, function(x) suppressMessages(require(x, character.only = TRUE)))
		rm(bla)
		if(ftype == "inex"){

		fctsfilein <- fread(paste("cut -f2,3 ",abamfile[1],".featureCounts",sep=""), sep="\t",quote='',header = F) #in
		fctsfileex <- fread(paste("cut -f2,3 ",abamfile[2],".featureCounts",sep=""), sep="\t",quote='',header = F) #ex
		reads <- fread(paste("cut -f10 ",ubamfile,sep=""), quote='',header = F,skip=1)
		reads <- tibble::tibble(XC=substring(reads$V1, bcstart, bcend),XM=substring(reads$V1, umistart, umiend),GE=fctsfileex$V2,assignment=fctsfileex$V1,ftype="exon")
		fctsfilein$ftype<-"intron"
		reads[which(reads$GE==""),c("GE","assignment","ftype")] <- fctsfilein[which(reads$GE==""),c("V2","V1","ftype")]
		reads$ftype <- ifelse(reads$assignment=="Assigned",reads$ftype,"inex")
		saveRDS(object = reads,file = paste(out,"/zUMIs_output/expression/",sn,".tbl.rds",sep=""))
		} else{

		fcts <- featureCounts(files=abamfile,annot.ext=safannot,isGTFAnnotationFile=F,primaryOnly=T,nthreads=1,reportReads=T,strandSpecific=stra)# do not use more than nthreads=1!

		fctsfile <- fread(paste("cut -f3 ",abamfile,".featureCounts",sep=""), sep="\t",quote='',header = F)
		reads <- fread(paste("cut -f10 ",ubamfile,sep=""), quote='',header = F,skip=1)

		reads <- tibble(XC=substring(reads$V1, bcstart, bcend),XM=substring(reads$V1, umistart, umiend),GE=fctsfile$V1)
		}
		if(is.numeric(bcfile)){
		bc <- reads %>% group_by(XC) %>% dplyr::summarise(n=length(XM)) %>% top_n(bcfile) %>% dplyr::select(V1=XC)
		}else if(is.na(bcfile)){
		fullstats <- reads %>% group_by(XC) %>% summarise(nreads=length(XM))
		fullstats <- fullstats[order(fullstats$nreads,decreasing = T),]
		fullstats$cs <- cumsum(fullstats$nreads)
		d <- density(log10(fullstats[which(fullstats$cs<0.99*sum(fullstats$nreads)),]$nreads), bw = 0.5) #taking only things that account for 99% of reads removes most crap barcodes
		tp<-pastecs::turnpoints(ts(d$y))#find turning points
		cutoff_tp <- which(d$y[tp$tp]==max(d$y[tp$tp]))+1 # select first local minimum after the major peak
		minreads_cutoff <- 10^(d$x[tp$tppos[cutoff_tp]])

		fullstats_detected <- fullstats[which(fullstats$nreads >= minreads_cutoff),]
		fullstats_detected <- as.data.frame(fullstats_detected)

		bc <- fullstats_detected$XC
		}else{
		bc <- read.table(bcfile,header = F,stringsAsFactors = F)
		}

		cluster <- create_cluster(ncores)
		set_default_cluster(cluster)

		umicounts <- reads %>% dplyr::filter((XC %in% bc$V1) & (GE!="*")) %>% partition(XC, cluster = cluster) %>% group_by(XC,GE) %>% summarise(umicount=length(unique(XM)),readcount=length(XM)) %>% collect()

		if(subsampling!="0") {
		if(grepl(pattern = ",",x = subsampling)==TRUE){
		tmpsplit <- strsplit(x = subsampling,split = ",")[[1]]
		ndepths <- length(tmpsplit)

		}else{
		ndepths <- 1
		tmpsplit <- subsampling
		}
		downsampling_list <-list()
		for(i in 1:ndepths){
		subsampling_iter <- tmpsplit[i]
		if(grepl(pattern = "-",x = subsampling_iter)==TRUE){
		subsampling_min <- as.numeric(strsplit(subsampling_iter,"-")[[1]][1])
		subsampling_max <- as.numeric(strsplit(subsampling_iter,"-")[[1]][2])

		if(as.logical((reads %>% group_by(XC) %>% dplyr::summarise(n=length(XM)) %>% top_n(1))[,2] >= subsampling_min)==TRUE){
		print(paste("I am subsampling to ",subsampling_iter,sep=""))
		tmp1 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% group_by(XC) %>% filter(length(XC) > subsampling_max) %>% dplyr::sample_n(size = subsampling_max,replace=F)%>% dplyr::filter(GE!="*") %>% group_by(XC,GE) %>% summarise(umicount=length(unique(XM)),readcount=length(XM))
		tmp2 <- reads %>% dplyr::filter(XC %in% bc$V1) %>% group_by(XC) %>% filter((length(XC) < subsampling_max) & (length(XC) >= subsampling_min))%>% dplyr::filter(GE!="*") %>% group_by(XC,GE) %>% summarise(umicount=length(unique(XM)),readcount=length(XM))
		umicounts_sub <- bind_rows(tmp1,tmp2)
		}else{
		print("Error! None of the barcodes has more than the requested minimal number of reads")
		}
		}else{
		subsampling_no <- as.numeric(subsampling_iter)
		if(as.logical((reads %>% group_by(XC) %>% dplyr::summarise(n=length(XM)) %>% top_n(1))[,2] >= subsampling_no)==TRUE){
		print(paste("I am subsampling to ",subsampling_iter,sep=""))
		umicounts_sub <- reads %>% dplyr::filter(XC %in% bc$V1) %>% group_by(XC) %>% filter(length(XC) >= subsampling_no) %>% dplyr::sample_n(size = subsampling_no,replace=F)%>% dplyr::filter(GE!="*") %>% group_by(XC,GE) %>% summarise(umicount=length(unique(XM)),readcount=length(XM))

		}else{
		print("Error! None of the barcodes has more than the requested number of reads")
		}
		}
		umicounts_sub_wide <- makewide(umicounts_sub,"umicount")
		readcounts_sub_wide <- makewide(umicounts_sub,"readcount")
		iterlist <- list(readcounts_sub_wide,umicounts_sub_wide)
		names(iterlist) <-c("readcounts_downsampled","umicounts_downsampled")
		downsampling_list[[i]] <- iterlist

		}
		if(ndepths==1){
		names(downsampling_list) <- paste("downsampled",subsampling,sep="_")
		}else{
		names(downsampling_list) <- paste("downsampled",tmpsplit,sep="_")
		}
		}else{
		fullstats <- reads %>% group_by(XC) %>% summarise(nreads=length(XM))
		fullstats <- fullstats[order(fullstats$nreads,decreasing = T),]
		fullstats$cs <- cumsum(fullstats$nreads)


		if(is.numeric(bcfile) \| is.na(bcfile)){
		d <- density(log10(fullstats[which(fullstats$cs<0.99*sum(fullstats$nreads)),]$nreads), bw = 0.5) #taking only things that account for 99% of reads removes most crap barcodes
		tp<-pastecs::turnpoints(ts(d$y))#find turning points
		cutoff_tp <- which(d$y[tp$tp]==max(d$y[tp$tp]))+1 # select first local minimum after the major peak
		minreads_cutoff <- 10^(d$x[tp$tppos[cutoff_tp]])

		fullstats_detected <- fullstats[which(fullstats$nreads >= minreads_cutoff),]
		fullstats_detected <- as.data.frame(fullstats_detected)

		bcs_detected <- fullstats_detected$XC
		}else{ #if people give a barcode annotation, only those detected barcodes that are also annotated should be selected
		bcs_detected <- bc$V1
		fullstats_detected<- fullstats[which(fullstats$XC %in% bc$V1),]
		}

		medianreads <- round(median(fullstats_detected$nreads),digits = 0)
		MAD_up <- 10^(log10(medianreads) + 3*median(abs(log10(fullstats_detected$nreads)-median(log10(fullstats_detected$nreads)))))
		MAD_low <- 10^(log10(medianreads) - 3*median(abs(log10(fullstats_detected$nreads)-median(log10(fullstats_detected$nreads)))))
		#check that low is not under 0

		print(paste("I detected ",length(bcs_detected)," cells and am subsampling to ",medianreads," reads",sep=""))
		tmp1 <- reads %>% dplyr::filter(XC %in% bcs_detected) %>% group_by(XC) %>% filter(length(XC) > MAD_up) %>% dplyr::sample_n(size = MAD_up,replace=F) %>% dplyr::filter(GE!="*") %>% group_by(XC,GE) %>% summarise(umicount=length(unique(XM)),readcount=length(XM))
		tmp2 <- reads %>% dplyr::filter(XC %in% bcs_detected) %>% group_by(XC) %>% filter((length(XC) >= MAD_low )& (length(XC) <= MAD_up)) %>% dplyr::filter(GE!="*") %>% group_by(XC,GE) %>% summarise(umicount=length(unique(XM)),readcount=length(XM))
		umicounts_sub <- bind_rows(tmp1,tmp2)

		downsampling_list <-list()
		umicounts_sub_wide <- makewide(umicounts_sub,"umicount")
		readcounts_sub_wide <- makewide(umicounts_sub,"readcount")
		iterlist <- list(readcounts_sub_wide,umicounts_sub_wide)
		names(iterlist) <-c("readcounts_downsampled","umicounts_downsampled")
		downsampling_list[[1]] <- iterlist
		names(downsampling_list) <- paste("downsampled",medianreads,sep="_")

		#make some plots for people

		#selected cells
		pdf(file=paste(out,"/zUMIs_output/stats/",sn,".detected_cells.pdf",sep=""))
		plot(cumsum(fullstats$nreads),xlab="Cell Index", ylab="Cumulative number of reads")
		points(cumsum(fullstats_detected$nreads),col="red")
		dev.off()

		#downsampling
		#check if ranges include MAD max
		pdf(file=paste(out,"/zUMIs_output/stats/",sn,".downsampling_thresholds.pdf",sep=""))
		barplot(fullstats_detected$nreads,ylab="Number of reads",xlab="Cell Barcodes",ylim = c(0,1.1*max(c(fullstats_detected$nreads,MAD_up))))
		abline(h=MAD_low,col="red")
		abline(h=MAD_up,col="red")
		dev.off()
		}

		umicounts_wide <- makewide(umicounts,"umicount")

		readcounts_wide <- makewide(umicounts,"readcount")


		l <- list(readcounts_wide,umicounts_wide,downsampling_list)
		names(l) <- c("readcounts","umicounts","downsampled")


		rm(reads,readcounts_wide,umicounts,umicounts_wide)

		return(l)
		}

		#outsuffix <- paste(out,"/zUMIs_output/",sn,".DGE.log",sep="")

		#cl <- makeCluster(getOption("cl.cores", 2),outfile=outsuffix)

		bams <- c(paste(abamfile,"in",sep="."),paste(abamfile,"ex",sep="."))

		#clusterExport(cl, c("makeGEprofile", "saf", "bams", "ubamfile", "barcodes", "ncores","stra","bcstart","bcend","umistart","umiend","subsampling","ftype","sn","out") )

		#AllCounts<-parLapply(cl, 1:length(saf) ,function(i){ makeGEprofile(bams,ubamfile,barcodes,saf[[i]],ncores,stra,bcstart,bcend,umistart,umiend,subsampling,ftype[i],sn,out) })

		AllCounts<-lapply(1:length(saf) ,function(i){ makeGEprofile(bams,ubamfile,barcodes,saf[[i]],ncores,stra,bcstart,bcend,umistart,umiend,subsampling,ftype[i],sn,out) })

		#stopCluster(cl)
		names(AllCounts)<-names(saf)

		AllCounts$intron.exon <- makeGEprofile(bams,ubamfile,barcodes,saf[[i]],ncores,stra,bcstart,bcend,umistart,umiend,subsampling,ftype[3],sn,out)


		intronunique <- function(intronexondf,exondf){
		ex_in_gene_intersect <- intersect(row.names(intronexondf),row.names(exondf))
		ex_in_cell_intersect <- intersect(colnames(intronexondf),colnames(exondf))

		uniquein <- rbind(intronexondf[which(!(row.names(intronexondf) %in% row.names(exondf))),ex_in_cell_intersect],
		(intronexondf[ex_in_gene_intersect,ex_in_cell_intersect] - exondf[ex_in_gene_intersect,ex_in_cell_intersect])
		)
		uniquein <- uniquein[which(rowSums(uniquein)>0),]
		return(uniquein)
		}

		AllCounts$introns$umicounts <- intronunique(AllCounts$intron.exon$umicounts,AllCounts$exons$umicounts)
		AllCounts$introns$readcounts <- intronunique(AllCounts$intron.exon$readcounts,AllCounts$exons$readcounts)

		tmpintersect <- intersect(row.names(AllCounts$introns$umicounts),row.names(AllCounts$introns$readcounts))
		AllCounts$introns$umicounts <- AllCounts$introns$umicounts[tmpintersect,]
		AllCounts$introns$readcounts <- AllCounts$introns$readcounts[tmpintersect,]
		rm(tmpintersect)

		if(subsampling!= "0") {
		print("I am making intronunique...")
		if(grepl(pattern = ",",x = subsampling)==TRUE){
		tmpsplit <- strsplit(x = subsampling,split = ",")[[1]]
		ndepths <- length(tmpsplit)

		}else{
		ndepths <- 1
		}
		for(n in ndepths){
		AllCounts$introns$downsampled[[n]]$umicounts_downsampled <- intronunique(AllCounts$intron.exon$downsampled[[n]]$umicounts_downsampled,AllCounts$exons$downsampled[[n]]$umicounts_downsampled)

		AllCounts$introns$downsampled[[n]]$readcounts_downsampled <- intronunique(AllCounts$intron.exon$downsampled[[n]]$readcounts_downsampled,AllCounts$exons$downsampled[[n]]$readcounts_downsampled)

		tmpintersect <- intersect(row.names(AllCounts$introns$downsampled[[n]]$umicounts_downsampled),row.names(AllCounts$introns$downsampled[[n]]$readcounts_downsampled))
		AllCounts$introns$downsampled[[n]]$umicounts_downsampled <- AllCounts$introns$downsampled[[n]]$umicounts_downsampled[tmpintersect,]
		AllCounts$introns$downsampled[[n]]$readcounts_downsampled <- AllCounts$introns$downsampled[[n]]$readcounts_downsampled[tmpintersect,]
		rm(tmpintersect)
		}
		}

		saveRDS(AllCounts,file=paste(out,"/zUMIs_output/expression/",sn,".dgecounts.rds",sep=""))
		#lapply(names(AllCounts),function(x) lapply(names(AllCounts[[x]]), function(xx) write.table(AllCounts[[x]][xx],file=paste(out,"/zUMIs_output/expression/",sn,xx,".",x,".txt",sep=""),sep = "\t",row.names = T,col.names = T)))

		#################

		print(Sys.time())
		print(paste("I am done!! Look what I produced...",out,"/zUMIs_output/",sep=""))
		print(gc())
		q()

Admin message