Commit afa0322b authored by ziegenhain@bio.lmu.de's avatar ziegenhain@bio.lmu.de
Browse files

optimize hamming distance functionality

parent 46dd3e6b
Loading
Loading
Loading
Loading
+78 −42
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@ opt = parse_args(opt_parser);

print("I am loading useful packages...")
print(Sys.time())
packages <-c("multidplyr","dplyr","tidyr","broom","stringdist","reshape2","data.table","optparse","parallel","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble","mclust","Matrix","Rsubread")
packages <-c("multidplyr","dplyr","tidyr","broom","reshape2","data.table","optparse","parallel","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble","mclust","Matrix","Rsubread")
paks<-lapply(packages, function(x) suppressMessages(require(x, character.only = TRUE)))
rm(paks)

@@ -201,22 +201,50 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
    return(rcfilt)
  }
  
  ham_twomats <- function(barcodes, XCstrings) {
    # Pairwise Hamming distances between two sets of equal-length strings.
    # Returns a length(barcodes) x length(XCstrings) matrix of distances.
    barcodes <- as.character(barcodes) # guard against factors
    bc_chars <- matrix(unlist(strsplit(barcodes, "")), ncol = length(barcodes))
    xc_chars <- matrix(unlist(strsplit(XCstrings, "")), ncol = length(XCstrings))

    # Count positional matches with one indicator cross-product per symbol,
    # then subtract from the string length to get mismatches.
    # Approach thanks to Johann de Jong: https://goo.gl/u8RBBZ
    symbols <- union(bc_chars, xc_chars)
    matches <- Reduce(
      `+`,
      lapply(symbols, function(s) t(bc_chars == s) %*% (xc_chars == s))
    )
    nrow(bc_chars) - matches
  }

  hammingFilter<-function(umiseq, edit=1){
    ham_mat <- function(umistrings) {
      # Symmetric pairwise Hamming-distance matrix for equal-length strings.
      chars <- matrix(unlist(strsplit(umistrings, "")), ncol = length(umistrings))
      # Sum one indicator cross-product per observed symbol to count matching
      # positions, then convert matches to mismatches.
      # Approach thanks to Johann de Jong: https://goo.gl/u8RBBZ
      symbols <- unique(as.vector(chars))
      shared <- matrix(0, ncol(chars), ncol(chars))
      for (s in symbols) {
        ind <- chars == s
        shared <- shared + crossprod(ind)
      }
      nrow(chars) - shared
    }
    library(dplyr) #necessary for pipe to work within multidplyr workers
    # umiseq a vector of umis, one per read
    umiseq <- sort(umiseq)
      uc     <- data.frame(us = umiseq) %>% dplyr::count(us) # normal UMI counts
    uc     <- data.frame(us = umiseq,stringsAsFactors = F) %>% dplyr::count(us) # normal UMI counts
    
    if(length(uc$us)>1 && length(uc$us)<100000){ #prevent use of > 100Gb RAM
        umi <- stringdist::stringdistmatrix(uc$us,method="hamming",useNames = "strings",nthread=1) %>% #only 1 core for each multidplyr worker
          broom::tidy() %>%
          dplyr::filter( distance <= edit  ) %>% # only remove below chosen dist
          dplyr::left_join(uc, by = c( "item1" = "us")) %>%
          dplyr::left_join(uc, by = c( "item2" = "us"), suffix =c(".1",".2")) %>%
          dplyr::transmute( rem=if_else( n.1>=n.2, item2, item1 )) %>% #discard the UMI with fewer reads
          unique()
      Sys.time()
      umi <-  ham_mat(uc$us) #construct pairwise UMI distances
      umi[upper.tri(umi,diag=T)] <- NA #remove upper triangle of the output matrix
      umi <- reshape2::melt(umi, varnames = c('row', 'col'), na.rm = TRUE) %>% dplyr::filter( value <= edit  ) #make a long data frame and filter according to cutoff
      umi$n.1 <- uc[umi$row,]$n #add in observed freq
      umi$n.2 <- uc[umi$col,]$n#add in observed freq
      umi <- umi %>%dplyr::transmute( rem=if_else( n.1>=n.2, col, row )) %>%  unique() #discard the UMI with fewer reads
      if(nrow(umi)>0){
          uc <- uc[-match(umi$rem,uc$us),] #discard all filtered UMIs
        uc <- uc[-umi$rem,] #discard all filtered UMIs
      }
    }
    n <- nrow(uc)
@@ -284,9 +312,16 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
  }
  ## XC binning below
  if(XCbin != 0){
    XC_obs<-unique(reads$XC)
    if(length(XC_obs)*length(bc$V1)> 1e+10){
      print("There are too many noisy barcodes, binning will be skipped")
    }else{
      print(paste("I am binning cell barcodes within hamming distance ",XCbin,sep=""))
    binmat <- stringdist::stringdistmatrix(bc$V1,unique(reads$XC),method="hamming",useNames = "strings",nthread=ncores)
      paste("This may be slow, depending on the number of reads")
      binmat <- ham_twomats(bc$V1,XC_obs)
      tmp <- reshape2::melt(binmat) %>% dplyr::mutate_if(is.factor, as.character) %>% dplyr::filter(value>0 & value <=XCbin)
      tmp$Var1<-bc$V1[tmp$Var1]
      tmp$Var2<-XC_obs[tmp$Var2]
      if(XCbin > 1){ #if there are conflicts we can choose the closer ones for dists >1
        dups <- tmp$Var2[duplicated(tmp$Var2)]
        for(i in dups){
@@ -314,12 +349,13 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
      }
      saveRDS(object = reads,file = paste(out,"/zUMIs_output/expression/",sn,".XCbinned.tbl.rds",sep=""))
    }
  }
  ## end XC binning

  if(HamDist==0){
    umicounts <- reads %>% dplyr::filter((XC %in% bc$V1) & (!is.na(GE))) %>% dplyr::group_by(XC,GE) %>% dplyr::summarise(umicount=length(unique(XM)),readcount=length(XM))
  }else{
    cluster <- create_cluster(ncores) # The clustering seems to have issue in partition when there is not enough data to spread on all the cores.
    cluster <- create_cluster(ncores) # The clustering may have issues in partition when there is not enough data to spread on all the cores.
    set_default_cluster(cluster)
    cluster_copy(cluster, hammingFilter)
    cluster_copy(cluster, HamDist)
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh &  Christoph Ziegenhain
# Contact: parekh@bio.lmu.de or christoph.ziegenhain@ki.se or hellmann@bio.lmu.de
vers=0.0.4
vers=0.0.5
function check_opts() {
    value=$1
    name=$2
+27 −22
Original line number Diff line number Diff line
@@ -40,6 +40,28 @@ XCbin="${36}"
pbcfastq="${37}"
pbcrange="${38}"

# If a separate plate-barcode fastq is supplied, the plate barcode is prepended
# to the cell barcode, so the barcode (xc) and UMI (xm) coordinate ranges must
# be shifted; otherwise the user-supplied ranges are used unchanged.
if [[ "$pbcfastq" != "NA" ]] ; then
 # NOTE(review): "start + end - 1" equals the range length only when the
 # range starts at 1 — confirm that pbcrange/xc always start at position 1.
 tmpa=`echo $pbcrange | cut -f1 -d '-'`
 tmpb=`echo $pbcrange | cut -f2 -d '-'`
 pbcl=`expr $tmpa + $tmpb - 1`
 tmpa=`echo $xc | cut -f1 -d '-'`
 tmpb=`echo $xc | cut -f2 -d '-'`
 bcl=`expr $tmpa + $tmpb - 1`
 # Combined barcode spans positions 1..(plate barcode + cell barcode length).
 l=`expr $bcl + $pbcl`
 xcst=1
 xcend=$l
 # UMI starts right after the combined barcode and keeps its original length
 # (ml is end - start, i.e. length - 1, so xmend = xmst + ml is inclusive).
 xmst=`expr $l + 1`
 tmpa=`echo $xm | cut -f1 -d '-'`
 tmpb=`echo $xm | cut -f2 -d '-'`
 ml=`expr $tmpb - $tmpa`
 xmend=`expr $xmst + $ml`
 xmr="$xmst"-"$xmend"
 xcr="$xcst"-"$xcend"
else
 # No plate barcode: pass the original ranges through.
 xmr=$xm
 xcr=$xc
fi


if [[ "$isstrt" == "no" ]] ; then
	rl=`expr $r - 1`
@@ -49,10 +71,10 @@ else
fi

if [[ "$isstrt" == "no" ]] ; then
	xcst=`echo $xc | cut -f1 -d '-'`
	xcend=`echo $xc | cut -f2 -d '-'`
	xmst=`echo $xm | cut -f1 -d '-'`
	xmend=`echo $xm | cut -f2 -d '-'`
	xcst=`echo $xcr | cut -f1 -d '-'`
	xcend=`echo $xcr | cut -f2 -d '-'`
	xmst=`echo $xmr | cut -f1 -d '-'`
	xmend=`echo $xmr | cut -f2 -d '-'`
else
	xcst=1
	a=`echo $xc2 | cut -f2 -d '-'`
@@ -63,6 +85,7 @@ else
	xmend=`expr $c + $xmst`
fi


re='^[0-9]+$'

	case "$whichStage" in
@@ -73,24 +96,6 @@ re='^[0-9]+$'
				perl $zumisdir/fqfilter-inDrops.pl $f1 $f2 $libread $f3 $cq $cbq $mq $mbq $xm $t $sn $o $pigzexc
			else
				perl $zumisdir/fqfilter.pl $f2 $f1 $pbcfastq $cq $cbq $mq $mbq $xc $pbcrange $xm $t $sn $o $pigzexc
				if [[ "$pbcfastq" != "NA" ]] ; then
					tmpa=`echo $pbcrange | cut -f1 -d '-'`
					tmpb=`echo $pbcrange | cut -f2 -d '-'`
					pbcl=`expr $tmpa + $tmpb - 1`
					tmpa=`echo $xc | cut -f1 -d '-'`
					tmpb=`echo $xc | cut -f2 -d '-'`
					bcl=`expr $tmpa + $tmpb - 1`
					l=`expr $bcl + $pbcl`
					xc=1-"$l"
					xcst=1
					xcend=$l
					xmst=`expr $l + 1`
					tmpa=`echo $xm | cut -f1 -d '-'`
					tmpb=`echo $xm | cut -f2 -d '-'`
					ml=`expr $tmpb - $tmpa`
					xmend=`expr $xmst + $ml`
					xm="$xmst"-"$xmend"
				fi
			fi

			$starexc --genomeDir $g --runThreadN $t --readFilesCommand zcat --sjdbGTFfile $gtf --outFileNamePrefix $o/$sn. --outSAMtype BAM Unsorted --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --sjdbOverhang $rl --twopassMode Basic --readFilesIn $o/$sn.cdnaread.filtered.fastq.gz $x