Commit 1e6ace96 authored by cziegenhain's avatar cziegenhain
Browse files

small improvements

parent eb0ea5f1
Loading
Loading
Loading
Loading
+1 −1
Original line number Original line Diff line number Diff line
@@ -31,7 +31,7 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.


## Changelog
## Changelog
17 Sept 2020: zUMIs.2.9.4b: Fix Smart-seq3 UMI read counting
18 Sept 2020: zUMIs.2.9.4b/c: Fix Smart-seq3 UMI read counting. Prevent crash when a chunk of cell BCs does not match any downsampling. Speed up barcode detection steps for some cases.


12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Significantly speed up writing of error-corrected UMI tags to the bam file. Prevent potential crash when no cells meet any user-defined downsampling criteria.
12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Significantly speed up writing of error-corrected UMI tags to the bam file. Prevent potential crash when no cells meet any user-defined downsampling criteria.


+30 −19
Original line number Original line Diff line number Diff line
@@ -282,8 +282,10 @@ check_nonUMIcollapse <- function(seqfiles){
}
}


collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
  subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
  umiFUN<-"umiCollapseID"
  umiFUN<-"umiCollapseID"
  
  if(nrow(subsample.splits)>0){
    subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
    parallel::mclapply(mapList,function(tt){
    parallel::mclapply(mapList,function(tt){
      ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
      ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
                                      bccount=bccount,
                                      bccount=bccount,
@@ -299,17 +301,26 @@ collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
      names(ll$downsampling)<-subNames
      names(ll$downsampling)<-subNames
      ll
      ll
    }, mc.cores = length(mapList))
    }, mc.cores = length(mapList))

  }else{
    parallel::mclapply(mapList,function(tt){
      ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
                                      bccount=bccount,
                                      ftype=tt))
      ll
    }, mc.cores = length(mapList))
  }
}
}




# Merge a freshly processed chunk of count tables into the accumulated list.
# For every feature type in `alldt`, row-binds the primary ("all") table from
# `newdt`, and — when the new chunk carries downsampling results — row-binds
# each named downsampling table as well. Returns the grown accumulator.
bindList <- function(alldt, newdt) {
  for (feat in names(alldt)) {
    # first element is always the full ("all") count table
    alldt[[feat]][[1]] <- rbind(alldt[[feat]][[1]], newdt[[feat]][[1]])
    # downsampling tables only exist when subsampling splits were requested
    if ("downsampling" %in% names(newdt[[feat]])) {
      for (ds in names(alldt[[feat]][[2]])) {
        alldt[[feat]][[2]][[ds]] <- rbind(alldt[[feat]][[2]][[ds]],
                                          newdt[[feat]][[2]][[ds]])
      }
    }
  }
  return(alldt)
}


+25 −4
Original line number Original line Diff line number Diff line
@@ -162,8 +162,29 @@ setDownSamplingOption<-function( down ,bccount, filename=NULL){
  # Observed barcode length differs from the whitelist barcode length:
  # try to reconcile by matching a sub-piece of the observed barcode.
  if(nchar(bccount[keep==TRUE,XC][1]) != nchar(bc_wl[1])){
    print("length of barcodes not equal to given barcode list, trying to match up...")
    search_vector <- bccount[keep==TRUE,XC]
    # Ensure bc_matched is defined even when no BC piece matches the whitelist
    # length below; otherwise length(bc_matched) would error on an unbound name.
    bc_matched <- integer(0)

    # First check if a partial barcode (one BC piece of the base definition)
    # matches the length of the whitelist barcodes.
    bc_definition <- sapply(opt$sequence_files, function(x) grep("BC", x$base_definition, value = TRUE))
    if(length(bc_definition) > 1){ # only makes sense with at least 2 BC pieces defined
      # strip the "BC(" prefix and ")" suffix, leaving the "start-end" range
      bc_definition <- sapply(bc_definition, function(x) substr(x = x, start = 4, stop = nchar(x) - 1))
      bc_len_mat <- t(matrix(as.numeric(unlist(strsplit(bc_definition, "-"))), ncol = length(bc_definition)))
      bc_lens <- bc_len_mat[,2] - bc_len_mat[,1] + 1 # length of each barcode piece
      if(sum(bc_lens == nchar(bc_wl[1])) == 1){ # exactly one piece matches the whitelist length
        match_piece <- which(bc_lens == nchar(bc_wl[1]))
        sub_start <- 1
        sub_stop <- bc_lens[match_piece]
        if(match_piece > 1){
          # Offset by the TOTAL length of all preceding pieces. sum() is
          # required here: bc_lens[1:(match_piece-1)] is a vector when
          # match_piece > 2, which would make sub_start/sub_stop vectors
          # and silently corrupt the substring extraction below.
          sub_start <- sub_start + sum(bc_lens[1:(match_piece - 1)])
          sub_stop <- sub_stop + sum(bc_lens[1:(match_piece - 1)])
        }
        XC_vector_substring <- substr(search_vector, start = sub_start, stop = sub_stop)
        bc_matched <- which(XC_vector_substring %in% bc_wl)
      }
    }else{
      # Fall back to matching each whitelist barcode as a pattern anywhere in
      # the observed barcodes (parallelized over whitelist entries).
      bc_matched <- parallel::mcmapply(function(x) grep(pattern = x, x = search_vector), bc_wl, mc.cores = opt$num_threads, mc.preschedule = TRUE)
      bc_matched <- unlist(bc_matched)
    }

    if(length(bc_matched) > 0){
      # Drop all observed barcodes that could not be matched to the whitelist.
      to_remove <- search_vector[-bc_matched]
      bccount[XC %in% to_remove, keep := FALSE]
@@ -242,8 +263,8 @@ BCbin <- function(bccount_file, bc_detected) {
  nocell_BCs <- nocell_bccount[,XC]
  nocell_BCs <- nocell_bccount[,XC]


  if(opt$barcodes$BarcodeBinning>0){
  if(opt$barcodes$BarcodeBinning>0){
    #break up in pieces of 1000 real BCs in case the hamming distance calculation gets too large!
    #break up in pieces of 2000 real BCs in case the hamming distance calculation gets too large!
    true_chunks <- split(true_BCs, ceiling(seq_along(true_BCs)/1000))
    true_chunks <- split(true_BCs, ceiling(seq_along(true_BCs)/2000))
    for(i in 1:length(true_chunks)){
    for(i in 1:length(true_chunks)){
      dists <- stringdist::stringdistmatrix(true_chunks[[i]],nocell_BCs,method="hamming", nthread = opt$num_threads)
      dists <- stringdist::stringdistmatrix(true_chunks[[i]],nocell_BCs,method="hamming", nthread = opt$num_threads)
      dists <- setDT(data.frame(dists))
      dists <- setDT(data.frame(dists))
+1 −1
Original line number Original line Diff line number Diff line
@@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
vers=2.9.4b
vers=2.9.4c
currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/main/zUMIs.sh | grep '^vers=' | cut -f2 -d "=")
currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/main/zUMIs.sh | grep '^vers=' | cut -f2 -d "=")
if [ "$currentv" != "$vers" ] ; then
if [ "$currentv" != "$vers" ] ; then
    echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";
    echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";