Commit c06bcaa0 authored by cziegenhain's avatar cziegenhain
Browse files

zUMIs2.9.0: speed improvements

parent 991165d5
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ git clone https://github.com/sdparekh/zUMIs.git`
```
Start anaylysing your data:
```
zUMIs/zUMIs-master.sh -c -y my_config.yaml
zUMIs/zUMIs.sh -c -y my_config.yaml
```
zUMIs now comes with its own [miniconda](https://docs.conda.io/en/latest/miniconda.html) environment, so you do not need to deal with dependencies or installations (use the -c flag to use conda).
If you wish to use your own dependencies, head to the [zUMIs wiki](https://github.com/sdparekh/zUMIs/wiki/Installation#dependencies) to see what is required.
@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
05 July 2020: [zUMIs2.9.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.7.0): Speed up STAR read mapping by parallel instances if enough resources are available. Change the main zUMIs script to `zUMIs.sh`. Speed up and reduce clutter by loading reads from bam files using parallelised Rsamtools calls instead of printing temporary text files. Speed up counting by parallelising exon / intron / exon+intron counting as well as downsamplings. Speed up by parallelising creation of wide format count matrices. 

23 June 2020: zUMIs2.8.3: Merged code contribution from @gringer: prevent errors by emitting SAM headers in chunked unmapped .bam file output of fqfilter. Changed call to STAR to prevent stalling of samtools pipe. 

18 May 2020: zUMIs2.8.2: Added `merge_demultiplexed_fastq.R` to concatenate previously demultiplexed fastq files. For usage details see here: https://github.com/sdparekh/zUMIs/wiki/Starting-from-demultiplexed-fastq-files
+52 −91
Original line number Diff line number Diff line
@@ -56,96 +56,55 @@ ham_mat <- function(umistrings) {
  nrow(X) - H
}

prep_samtools <- function(featfile,bccount,inex,cores,samtoolsexc){
  print("Extracting reads from bam file(s)...")

  nchunks <- length(unique(bccount$chunkID))
  all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")


  for(i in unique(bccount$chunkID)){
    rgfile <- all_rgfiles[i]
    chunks <- bccount[chunkID==i]$XC
    write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
  }

reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigned = FALSE){
  chunk_bcs <- bccount[chunkID==chunk]$XC
  idxstats <- Rsamtools::idxstatsBam(featfile)
  taglist <- c("BC", "UB","GE")
  if(inex){
    headerXX <- paste( c(paste0("V",1:4)) ,collapse="\t")
  }else{
    headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
    taglist <- c(taglist, "GI")
  }
  write(headerXX,"freadHeader")

  headercommand <- "cat freadHeader > "
  layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
  samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x ES -x EN -x IS -x IN", layoutflag, "-@")

  outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
  system(paste(headercommand,outfiles,collapse = "; "))

  cpusperchunk <- round(cores/nchunks,0)

  if(inex == FALSE){
    grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/GE:Z://' | sed 's/UB:Z://' | grep -F -f "
    ex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")
  
    system(paste(ex_cmd,"wait"))
  rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
    if(opt$read_layout == "PE"){
      parms <- ScanBamParam(tag=taglist, 
                            what="pos", 
                            flag = scanBamFlag(isFirstMateRead = TRUE),
                            tagFilter = list(BC = chunk_bcs),
                            which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
    }else{
    grepcommand <- " | cut -f12,13,14,15 | sed 's/BC:Z://' | sed 's/Z://g' | grep -F -f "
    inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

    system(paste(inex_cmd,"wait"))
      parms <- ScanBamParam(tag=taglist, 
                            what="pos", 
                            tagFilter = list(BC = chunk_bcs),
                            which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
    }
  system("rm freadHeader")
  system(paste("rm",all_rgfiles))
    
  return(outfiles)
    dat <- scanBam(file = featfile, param = parms)
    if(inex){
      dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
    }else{
      dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE)
    }

reads2genes <- function(inex,chunkID, keepUnassigned = FALSE){

  samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",chunkID,".txt")

   if(inex==FALSE){
     reads<-data.table::fread(samfile, na.strings=c(""),
                              select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","GE","UB") )[
                              ,"ftype":="NA"
                              ][is.na(GE)==F,  ftype:="exon"]
    return(dt)
  }, mc.cores = cores)
  rsamtools_reads <- rbindlist(rsamtools_reads, fill = TRUE, use.names = TRUE)
  if(inex){
    rsamtools_reads[ , ftype :="NA"][
       is.na(GEin)==F, ftype :="intron"][
       is.na(GE)==F  , ftype:="exon"][
       is.na(GE)     , GE:=GEin][ 
                     ,GEin:=NULL ]
  }else{
    reads<-data.table::fread(samfile, na.strings=c(""),
                             select=c(1,2,3,4),header=T,fill=T,colClasses = "character" , col.names = c("RG","V2","V3","V4") )[
                                     , V2_id := substr(V2,1,2)
                                  ][ , V3_id := substr(V3,1,2)
                                  ][ , V4_id := substr(V4,1,2)
                                  ][ , V2 := substr(V2,4,nchar(V2))
                                  ][ , V3 := substr(V3,4,nchar(V3))
                                  ][ , V4 := substr(V4,4,nchar(V4))
                                  ][ V2_id == "GE", GE := V2
                                  ][ V2_id == "GI", GEin := V2
                                  ][ V2_id == "UB", UB := V2
                                  ][ V3_id == "GE", GE := V3
                                  ][ V3_id == "GI", GEin := V3
                                  ][ V3_id == "UB", UB := V3
                                  ][ V4_id == "UB", UB := V4
                                  ][ V4_id == "GI", GEin := V4
                                  ][ ,c("V2_id","V3_id","V4_id","V2","V3","V4") := NULL
                                  ][ ,"ftype":="NA"
                                  ][is.na(GEin)==F,ftype:="intron"
                                  ][is.na(GE)==F,  ftype:="exon"
                                  ][is.na(GE),GE:=GEin
                                  ][ ,GEin:=NULL ]

    rsamtools_reads[, ftype :="NA"][
        is.na(GE)==F, ftype :="exon"]
  }
  system(paste("rm",samfile))

  setkey(reads,RG)
  setkey(rsamtools_reads,RG)
  
  if(keepUnassigned){
    return( reads )
    return( rsamtools_reads )
  }else{
    return( reads[GE!="NA"] )
    return( rsamtools_reads[GE!="NA"] )
  }

}

hammingFilter<-function(umiseq, edit=1, gbcid=NULL){
@@ -322,19 +281,19 @@ check_nonUMIcollapse <- function(seqfiles){
collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
  subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
  umiFUN<-"umiCollapseID"
  lapply(mapList,function(tt){
  parallel::mclapply(mapList,function(tt){
    ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
                                bccount=bccount,
                                ftype=tt),
              downsampling=lapply( 1:nrow(subsample.splits) , function(i){
              downsampling=parallel::mclapply( 1:nrow(subsample.splits) , function(i){
                umiFUNs[[umiFUN]](reads,bccount,
                                  nmin=subsample.splits[i,1],
                                  nmax=subsample.splits[i,2],
                                  ftype=tt)} )
                                  ftype=tt)}, mc.cores = floor(opt$num_threads/length(mapList)) )
    )
    names(ll$downsampling)<-subNames
    ll
  })
  }, mc.cores = length(mapList))

}

@@ -349,16 +308,18 @@ bindList<-function(alldt,newdt){
}

convert2countM<-function(alldt,what){
  fmat<-alldt
  fmat<-copy(alldt)
  for( i in 1:length(alldt)){
    fmat[[i]][[1]]<-.makewide(alldt[[i]][[1]],what)
    for(j in names(alldt[[i]][[2]])){
      fmat[[i]][[2]][[j]]<-.makewide(alldt[[i]][[2]][[j]],what)
    }
    fmat[[i]][[2]] <- fmat[[i]][[2]][sapply(fmat[[i]][[2]], function(x) nrow(x)>0)]
    downsamp_names <- names(fmat[[i]][[2]])
    fmat[[i]][[2]] <- parallel::mclapply(downsamp_names, function(x){
      .makewide(alldt[[i]][[2]][[x]],what)
    }, mc.cores = opt$num_threads)
    names(fmat[[i]][[2]]) <- downsamp_names
  }
  return(fmat)
}

write_molecule_mapping <- function(mm){
  mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
  bcs <- unique(mm$BC)
+21 −17
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@ library(methods)
library(data.table)
library(yaml)
library(ggplot2)
suppressPackageStartupMessages(library(Rsamtools))

##########################
myYaml <- commandArgs(trailingOnly = T)
@@ -126,15 +127,22 @@ if(opt$counting_opts$Ham_Dist == 0){
    dir.create( paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/") )
  }

  samouts <- prep_samtools(featfile  = outbamfile,
                           bccount   = bccount,
                           inex      = opt$counting_opts$introns,
                           cores     = opt$num_threads,
                           samtoolsexc=samtoolsexc)
  tmpbamfile <- outbamfile
  outbamfile <- paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.sorted.bam")
  print("Coordinate sorting intermediate bam file...")
  sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",outbamfile," ",tmpbamfile)
  system(sort_cmd)
  index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,outbamfile)
  system(index_cmd)
  system(paste0("rm ",tmpbamfile))
  
  for(i in unique(bccount$chunkID)){
    print( paste( "Hamming distance collapse in barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
    reads<-reads2genes( inex = opt$counting_opts$introns,
                        chunkID  = i )
    reads <- reads2genes_new(featfile = outbamfile,
                             bccount  = bccount,
                             inex     = opt$counting_opts$introns,
                             chunk    = i,
                             cores    = opt$num_threads)
    reads <- reads[!UB==""] #make sure only UMI-containing reads go further
    u <- umiCollapseHam(reads,bccount, HamDist=opt$counting_opts$Ham_Dist)
  }
@@ -144,7 +152,7 @@ if(opt$counting_opts$Ham_Dist == 0){
  outbamfile <- correct_UB_tags(bccount, samtoolsexc)
  sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
}
paste("Coordinate sorting final bam file...")
print("Coordinate sorting final bam file...")
sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",sortbamfile," ",outbamfile)
system(sort_cmd)
index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,sortbamfile)
@@ -177,18 +185,14 @@ if( opt$counting_opts$introns ){

########################## assign reads to UB & GENE

samouts <- prep_samtools(featfile  = sortbamfile,
                         bccount   = bccount,
                         inex      = opt$counting_opts$introns,
                         cores     = opt$num_threads,
                         samtoolsexc=samtoolsexc)

for(i in unique(bccount$chunkID)){
     print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
     print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
     reads<-reads2genes( inex = opt$counting_opts$introns,
                          chunkID  = i )

     reads <- reads2genes_new(featfile = sortbamfile,
                              bccount  = bccount,
                              inex     = opt$counting_opts$introns,
                              chunk    = i,
                              cores    = opt$num_threads)

     tmp<-collectCounts(  reads =reads,
                          bccount=bccount[chunkID==i],
+52 −17
Original line number Diff line number Diff line
@@ -10,6 +10,10 @@ additional_fq <- inp$reference$additional_files
samtools <- inp$samtools_exec
STAR_exec <- inp$STAR_exec

if(is.null(inp$mem_limit) | inp$mem_limit == 0){
  inp$mem_limit <- 100
}

# collect filtered bam files ----------------------------------------------
tmpfolder <- paste(inp$out_dir,"/zUMIs_output/.tmpMerge/",sep="")
if(inp$which_Stage == "Filtering"){
@@ -21,6 +25,12 @@ if(inp$which_Stage == "Filtering"){
}


# check if multiple STAR instances can be run -----------------------------

genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"| cut -f1"), intern = TRUE)
genome_size <- as.numeric(gsub(pattern = "G",replacement = "", x = genome_size))
num_star_instances <- floor(inp$mem_limit/genome_size)

# GTF file setup ----------------------------------------------------------
#in case of additional sequences, we need to create a custom GTF

@@ -77,24 +87,18 @@ cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
# Setup STAR mapping ------------------------------------------------------
samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
#if(avail_cores < 3){
#  cores_samtools <- 1
#  cores_star <- 2
#}else{
#  cores_samtools <- floor(avail_cores/3)
#  cores_star <- avail_cores - cores_samtools
#}
if(inp$which_Stage == "Filtering"){
  avail_cores <- floor(avail_cores / num_star_instances)
}

if(avail_cores < 2){
  avail_cores = 1
}

#param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype SAM --outStd SAM")
param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted")
param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted --quantMode TranscriptomeSAM")
param_misc <- paste("--genomeDir",inp$reference$STAR_index,
                    "--sjdbGTFfile",gtf_to_use,
                    "--runThreadN",avail_cores,
                    "--readFilesIn",paste0(filtered_bams,collapse=","),
                    "--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep=""),
                    "--sjdbOverhang", cDNA_read_length-1,
                    "--readFilesType SAM",inp$read_layout)

@@ -103,14 +107,45 @@ if(inp$counting_opts$twoPass==TRUE){
  STAR_command <- paste(STAR_command,"--twopassMode Basic")
}

#samtools_output <- paste0(" | ",samtools," view -@ ",cores_samtools," -o ",inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam")
#STAR_command <- paste(STAR_command,samtools_output)

#finally, run STAR
if(num_star_instances>1 & inp$which_Stage == "Filtering"){
  map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
  dir.create(path = map_tmp_dir)
  input_split <- split(filtered_bams, ceiling(seq_along(filtered_bams) / ceiling(length(filtered_bams) / num_star_instances)))
  input_split <- sapply(input_split, paste0, collapse = ",")
  STAR_preset <- STAR_command
  STAR_command <- lapply(seq(num_star_instances), function(x){
    paste(STAR_preset,
      "--readFilesIn",input_split[x],
      "--outFileNamePrefix",paste0(map_tmp_dir,"/tmp.",inp$project,".",x,"."))
  })
  STAR_command <- paste(unlist(STAR_command), collapse = " & ")
  system(paste(STAR_command,"&",sammerge_command,"& wait"))
  
  #after parallel instance STAR, collect output data in the usual file places
  out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
  merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
  merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
  out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
  merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
  system(paste(merge_logs,"&",merge_bams,"&",merge_txbams,"& wait"))
  system(paste0("rm ", map_tmp_dir, "tmp.", inp$project, ".*"))
}else{
  STAR_command <- paste(STAR_command,
    "--readFilesIn",paste0(filtered_bams,collapse=","),
    "--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep="")
  )
  if(inp$which_Stage == "Filtering"){
    system(paste(STAR_command,"&",sammerge_command,"& wait"))
  system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
  }else{
    system(STAR_command)
  }
}


#clean up chunked bam files
if(inp$which_Stage == "Filtering"){
  system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
}
q()
+2 −2
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
vers=2.8.3
vers=2.9.0
currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/master/zUMIs-master.sh | grep '^vers=' | cut -f2 -d "=")
if [ "$currentv" != "$vers" ] ; then
    echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";
@@ -75,7 +75,7 @@ check_opts "${yaml}" "YAML" "-y"

# create temporary YAML file for corrected options
yaml_orig=${yaml}
yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).corrected.yaml
yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).run.yaml
cp ${yaml_orig} ${yaml}

#now get some variables from YAML