Commit afa0322b authored by ziegenhain@bio.lmu.de's avatar ziegenhain@bio.lmu.de
Browse files

optimize hamming distance functionality

parent 46dd3e6b
Loading
Loading
Loading
Loading
+78 −42
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@ opt = parse_args(opt_parser);

print("I am loading useful packages...")
print(Sys.time())
packages <-c("multidplyr","dplyr","tidyr","broom","stringdist","reshape2","data.table","optparse","parallel","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble","mclust","Matrix","Rsubread")
packages <-c("multidplyr","dplyr","tidyr","broom","reshape2","data.table","optparse","parallel","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble","mclust","Matrix","Rsubread")
paks<-lapply(packages, function(x) suppressMessages(require(x, character.only = TRUE)))
rm(paks)

@@ -201,22 +201,50 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
    return(rcfilt)
  }
  
  ham_twomats <- function(barcodes, XCstrings) {
    # Pairwise Hamming distances between two sets of equal-length strings.
    # Returns a length(barcodes) x length(XCstrings) matrix of distances.
    barcodes <- as.character(barcodes) # guard against factors
    bc_chars <- matrix(unlist(strsplit(barcodes, "")), ncol = length(barcodes))
    xc_chars <- matrix(unlist(strsplit(XCstrings, "")), ncol = length(XCstrings))

    # Count positional matches with one indicator cross-product per symbol,
    # then subtract from the string length to get mismatches.
    # Approach thanks to Johann de Jong: https://goo.gl/u8RBBZ
    symbols <- union(bc_chars, xc_chars)
    matches <- Reduce(
      `+`,
      lapply(symbols, function(s) t(bc_chars == s) %*% (xc_chars == s))
    )
    nrow(bc_chars) - matches
  }

  hammingFilter<-function(umiseq, edit=1){
    ham_mat <- function(umistrings) {
      # Symmetric pairwise Hamming-distance matrix for equal-length strings.
      chars <- matrix(unlist(strsplit(umistrings, "")), ncol = length(umistrings))
      # Sum one indicator cross-product per observed symbol to count matching
      # positions, then convert matches to mismatches.
      # Approach thanks to Johann de Jong: https://goo.gl/u8RBBZ
      symbols <- unique(as.vector(chars))
      shared <- matrix(0, ncol(chars), ncol(chars))
      for (s in symbols) {
        ind <- chars == s
        shared <- shared + crossprod(ind)
      }
      nrow(chars) - shared
    }
    library(dplyr) #necessary for pipe to work within multidplyr workers
    # umiseq a vector of umis, one per read
    umiseq <- sort(umiseq)
      uc     <- data.frame(us = umiseq) %>% dplyr::count(us) # normal UMI counts
    uc     <- data.frame(us = umiseq,stringsAsFactors = F) %>% dplyr::count(us) # normal UMI counts
    
    if(length(uc$us)>1 && length(uc$us)<100000){ #prevent use of > 100Gb RAM
        umi <- stringdist::stringdistmatrix(uc$us,method="hamming",useNames = "strings",nthread=1) %>% #only 1 core for each multidplyr worker
          broom::tidy() %>%
          dplyr::filter( distance <= edit  ) %>% # only remove below chosen dist
          dplyr::left_join(uc, by = c( "item1" = "us")) %>%
          dplyr::left_join(uc, by = c( "item2" = "us"), suffix =c(".1",".2")) %>%
          dplyr::transmute( rem=if_else( n.1>=n.2, item2, item1 )) %>% #discard the UMI with fewer reads
          unique()
      Sys.time()
      umi <-  ham_mat(uc$us) #construct pairwise UMI distances
      umi[upper.tri(umi,diag=T)] <- NA #remove upper triangle of the output matrix
      umi <- reshape2::melt(umi, varnames = c('row', 'col'), na.rm = TRUE) %>% dplyr::filter( value <= edit  ) #make a long data frame and filter according to cutoff
      umi$n.1 <- uc[umi$row,]$n #add in observed freq
      umi$n.2 <- uc[umi$col,]$n#add in observed freq
      umi <- umi %>%dplyr::transmute( rem=if_else( n.1>=n.2, col, row )) %>%  unique() #discard the UMI with fewer reads
      if(nrow(umi)>0){
          uc <- uc[-match(umi$rem,uc$us),] #discard all filtered UMIs
        uc <- uc[-umi$rem,] #discard all filtered UMIs
      }
    }
    n <- nrow(uc)
@@ -284,9 +312,16 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
  }
  ## XC binning below
  if(XCbin != 0){
    XC_obs<-unique(reads$XC)
    if(length(XC_obs)*length(bc$V1)> 1e+10){
      print("There are too many noisy barcodes, binning will be skipped")
    }else{
      print(paste("I am binning cell barcodes within hamming distance ",XCbin,sep=""))
    binmat <- stringdist::stringdistmatrix(bc$V1,unique(reads$XC),method="hamming",useNames = "strings",nthread=ncores)
      paste("This may be slow, depending on the number of reads")
      binmat <- ham_twomats(bc$V1,XC_obs)
      tmp <- reshape2::melt(binmat) %>% dplyr::mutate_if(is.factor, as.character) %>% dplyr::filter(value>0 & value <=XCbin)
      tmp$Var1<-bc$V1[tmp$Var1]
      tmp$Var2<-XC_obs[tmp$Var2]
      if(XCbin > 1){ #if there are conflicts we can choose the closer ones for dists >1
        dups <- tmp$Var2[duplicated(tmp$Var2)]
        for(i in dups){
@@ -314,12 +349,13 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
      }
      saveRDS(object = reads,file = paste(out,"/zUMIs_output/expression/",sn,".XCbinned.tbl.rds",sep=""))
    }
  }
  ## end XC binning

  if(HamDist==0){
    umicounts <- reads %>% dplyr::filter((XC %in% bc$V1) & (!is.na(GE))) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
  }else{
    cluster <- create_cluster(ncores) # The clustering seems to have issue in partition when there is not enough data to spread on all the cores.
    cluster <- create_cluster(ncores) # The clustering may have issues in partition when there is not enough data to spread on all the cores.
    set_default_cluster(cluster)
    cluster_copy(cluster, hammingFilter)
    cluster_copy(cluster, HamDist)
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh &  Christoph Ziegenhain
# Contact: parekh@bio.lmu.de or christoph.ziegenhain@ki.se or hellmann@bio.lmu.de
vers=0.0.4
vers=0.0.5
function check_opts() {
    value=$1
    name=$2
+27 −22
Original line number Diff line number Diff line
@@ -40,6 +40,28 @@ XCbin="${36}"
pbcfastq="${37}"
pbcrange="${38}"

# If a separate plate-barcode fastq is supplied, the plate barcode is prepended
# to the cell barcode, so the barcode (xc) and UMI (xm) coordinate ranges must
# be shifted; otherwise the user-supplied ranges are used unchanged.
if [[ "$pbcfastq" != "NA" ]] ; then
 # NOTE(review): "start + end - 1" equals the range length only when the
 # range starts at 1 — confirm that pbcrange/xc always start at position 1.
 tmpa=`echo $pbcrange | cut -f1 -d '-'`
 tmpb=`echo $pbcrange | cut -f2 -d '-'`
 pbcl=`expr $tmpa + $tmpb - 1`
 tmpa=`echo $xc | cut -f1 -d '-'`
 tmpb=`echo $xc | cut -f2 -d '-'`
 bcl=`expr $tmpa + $tmpb - 1`
 # Combined barcode spans positions 1..(plate barcode + cell barcode length).
 l=`expr $bcl + $pbcl`
 xcst=1
 xcend=$l
 # UMI starts right after the combined barcode and keeps its original length
 # (ml is end - start, i.e. length - 1, so xmend = xmst + ml is inclusive).
 xmst=`expr $l + 1`
 tmpa=`echo $xm | cut -f1 -d '-'`
 tmpb=`echo $xm | cut -f2 -d '-'`
 ml=`expr $tmpb - $tmpa`
 xmend=`expr $xmst + $ml`
 xmr="$xmst"-"$xmend"
 xcr="$xcst"-"$xcend"
else
 # No plate barcode: pass the original ranges through.
 xmr=$xm
 xcr=$xc
fi


if [[ "$isstrt" == "no" ]] ; then
	rl=`expr $r - 1`
@@ -49,10 +71,10 @@ else
fi

if [[ "$isstrt" == "no" ]] ; then
	xcst=`echo $xc | cut -f1 -d '-'`
	xcend=`echo $xc | cut -f2 -d '-'`
	xmst=`echo $xm | cut -f1 -d '-'`
	xmend=`echo $xm | cut -f2 -d '-'`
	xcst=`echo $xcr | cut -f1 -d '-'`
	xcend=`echo $xcr | cut -f2 -d '-'`
	xmst=`echo $xmr | cut -f1 -d '-'`
	xmend=`echo $xmr | cut -f2 -d '-'`
else
	xcst=1
	a=`echo $xc2 | cut -f2 -d '-'`
@@ -63,6 +85,7 @@ else
	xmend=`expr $c + $xmst`
fi


re='^[0-9]+$'

	case "$whichStage" in
@@ -73,24 +96,6 @@ re='^[0-9]+$'
				perl $zumisdir/fqfilter-inDrops.pl $f1 $f2 $libread $f3 $cq $cbq $mq $mbq $xm $t $sn $o $pigzexc
			else
				perl $zumisdir/fqfilter.pl $f2 $f1 $pbcfastq $cq $cbq $mq $mbq $xc $pbcrange $xm $t $sn $o $pigzexc
				if [[ "$pbcfastq" != "NA" ]] ; then
					tmpa=`echo $pbcrange | cut -f1 -d '-'`
					tmpb=`echo $pbcrange | cut -f2 -d '-'`
					pbcl=`expr $tmpa + $tmpb - 1`
					tmpa=`echo $xc | cut -f1 -d '-'`
					tmpb=`echo $xc | cut -f2 -d '-'`
					bcl=`expr $tmpa + $tmpb - 1`
					l=`expr $bcl + $pbcl`
					xc=1-"$l"
					xcst=1
					xcend=$l
					xmst=`expr $l + 1`
					tmpa=`echo $xm | cut -f1 -d '-'`
					tmpb=`echo $xm | cut -f2 -d '-'`
					ml=`expr $tmpb - $tmpa`
					xmend=`expr $xmst + $ml`
					xm="$xmst"-"$xmend"
				fi
			fi

			$starexc --genomeDir $g --runThreadN $t --readFilesCommand zcat --sjdbGTFfile $gtf --outFileNamePrefix $o/$sn. --outSAMtype BAM Unsorted --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --sjdbOverhang $rl --twopassMode Basic --readFilesIn $o/$sn.cdnaread.filtered.fastq.gz $x