Commit 80b041ad authored by Christoph's avatar Christoph
Browse files

zUMIs2.8.0 update

parent 2f37151c
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
02 May 2020: zUMIs2.8.0: Ensure correct behaviour when resuming from mapping (`which_stage: Mapping`). It is now possible to fractionally count reads overlapping multiple features (on the same strand if strand-specificity is used) `multi_overlap: yes`. New function for parsing the gene annotations: Fixed a problem with genes completely within other another gene's intron & added the possibility to extend genomic interval of exons (this is useful for crossmapping species with poor annotations). Added a probability calculation to compare the occurance of intronic reads to an expectation generated from background intergenic mapping reads (`intronProb: yes`). Added statistics output for the total number of reads observed per gene.

22 Apr 2020: zUMIs2.7.3: zUMIs will try to parse a geneID to gene name mapping file from the user provided GTF annotations.

27 Mar 2020: zUMIs2.7.2: New barcode handling functionalities: When using intersection of automatic BC detection and BC whitelist and the full barcode is composed out of several barcode pieces (eg. RT barcode + illumina barcode), the whitelist can now also just be corresponding to just one of the barcode pieces (eg. RT barcode only whitelist). Furthermore, some scRNA-seq protocols may have several cell barcodes that belong to the same cell (eg. SPLiT-seq with oligo-dT/random-hex round 1 barcode; i7 barcode mix in 10x Genomics). zUMIs now supports internally combing the counts via the `barcode_sharing:` option. Please look at the [wiki for further details](https://github.com/sdparekh/zUMIs/wiki/Barcodes#barcode-sharing-feature) and at [examples for some protocols](https://github.com/sdparekh/zUMIs/wiki/Protocol-specific-setup).
+128 −15
Original line number Diff line number Diff line
@@ -102,7 +102,7 @@ prep_samtools <- function(featfile,bccount,inex,cores,samtoolsexc){
  return(outfiles)
}

reads2genes <- function(inex,chunkID){
reads2genes <- function(inex,chunkID, keepUnassigned = FALSE){

  samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",chunkID,".txt")

@@ -140,9 +140,14 @@ reads2genes <- function(inex,chunkID){

  setkey(reads,RG)

  if(keepUnassigned){
    return( reads )
  }else{
    return( reads[GE!="NA"] )
  }

}

hammingFilter<-function(umiseq, edit=1, gbcid=NULL){
  # umiseq a vector of umis, one per read
  uc <- data.table(us = umiseq)[, .N, by = "us"] # normal UMI counts
@@ -241,11 +246,16 @@ ham_helper_fun <- function(x){
}

.makewide <- function(longdf,type){
  ge<-as.factor(longdf$GE)
  xc<-as.factor(longdf$RG)
  dt<-longdf[, list(GEu=unlist(strsplit(GE,","))), by=c("RG","GE",type)][ #this splits multioverlap gene lists by comma
             , cnt := get(type) / (stringr::str_count(GE,",")+1) ][
             , list(tot=sum(cnt)),by=c("RG","GEu")  ]

  ge<-as.factor(dt$GEu)
  xc<-as.factor(dt$RG)

  widedf <- Matrix::sparseMatrix(i=as.integer(ge),
                                 j=as.integer(xc),
                                 x=as.numeric(unlist(longdf[,type,with=F])),
                                 x=as.numeric(unlist(dt[,"tot",with=F])),
                                 dimnames=list(levels(ge), levels(xc)))
  return(widedf)
}
@@ -388,7 +398,7 @@ demultiplex_bam <- function(opt, bamfile, nBCs){
    dir.create( paste0(opt$out_dir,"/zUMIs_output/demultiplexed/") )
  }

  installed_py <- system("pip freeze", intern = TRUE)
  installed_py <- try(system("pip freeze", intern = TRUE))

  if(any(grepl("pysam==",installed_py))){
    print("Using python implementation to demultiplex.")
@@ -515,6 +525,26 @@ fixMissingOptions <- function(config){
    config$counting_opts$downsampling <- "0"
  }

  if(is.null(config$reference$exon_extension)){
    config$reference$exon_extension <- FALSE
  }

  if(is.null(config$reference$extension_length)){
    config$reference$extension_length <- 0
  }

  if(is.null(config$reference$scaffold_length_min)){
    config$reference$scaffold_length_min <- 0
  }

  if(is.null(config$counting_opts$multi_overlap)){
    config$counting_opts$multi_overlap <- FALSE
  }

  if(is.null(config$counting_opts$intronProb)){
    config$counting_opts$intronProb <- FALSE
  }

  return(config)
}

@@ -527,3 +557,86 @@ RPKM.calc <- function(exprmat, gene.length){
  y <- t(t(x)/lib.size)
  y/gene.length.kb
}


.intronProbability<-function(featfile,bccount,inex,cores,samtoolsexc,saf, allC){
  print("Fetching reads from bam files again to calculate intron scores...")
  
  nchunks <- length(unique(bccount$chunkID))
  all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")
  
  for(i in unique(bccount$chunkID)){
    rgfile <- all_rgfiles[i]
    chunks <- bccount[chunkID==i]$XC
    write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
  }
  
  headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
  write(headerXX,"freadHeader")
  
  headercommand <- "cat freadHeader > "
  layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
  samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x UB -x EN -x IN -x GE -x GI", layoutflag, "-@")
  
  outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
  system(paste(headercommand,outfiles,collapse = "; "))
  
  cpusperchunk <- round(cores/nchunks,0)
  
  grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/ES:Z://g' | sed 's/IS:Z://g' | grep -F -f "
  inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")
  
  system(paste(inex_cmd,"wait"))
  system("rm freadHeader")
  system(paste("rm",all_rgfiles))
  
  #continue with reading the data in
  for(i in unique(bccount$chunkID)){
    print(paste("Working on barcode chunk", i, "out of", length(unique(bccount$chunkID))))
    print(paste("Processing", length(bccount[chunkID == i]$XC), "barcodes in this chunk..."))
    samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",i,".txt")
    
    reads<-data.table::fread(samfile, na.strings=c(""),
                             select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","ES","IS") )
    reads <- reads[ES == "Unassigned_NoFeatures" & IS == "Unassigned_NoFeatures"]
    system(paste("rm",samfile))  
    
    scores_out<-.calculateProbabilityScores(reads = reads,
                                            saf = saf,
                                            bccount = bccount[chunkID == i],
                                            allC = allC)
    if(i == 1){
      scores <- scores_out
    }else{
      scores <- rbind(scores, scores_out)
    }
  }
  return (scores)
}


.calculateProbabilityScores<-function(reads,saf,bccount,allC){
  #count number of intergenic reads per BC; use only intronic mapped reads
  dt <-reads[,.(intergenicPerBC = .N), by = RG]

  #get count values
  tmp <- merge(x= allC$exon$all, y = allC$intron$all, by=c("GE","RG"),suffixes=c(".ex",".in"),all=T)
  tmp <- tmp[,c("GE","RG","readcount.ex","readcount.in")]
  tmp[is.na(tmp)]<-0
  #tmp[readcount.in ==0,  readcount.in := 1] #??

  #get intronic bp per gene
  tmp<-merge(x=tmp,y=saf$intronsPerGene, by.x="GE", by.y="GeneID")
  
  
  dt<-merge(x=dt,y=tmp, by="RG", all.x=T)


  #lambda = (total number of intergenic reads in genome (per BC)/all intergenic bp in genome)*intronic bp per gene
  dt<-dt[, lambda:= (intergenicPerBC/saf$intergenicBp) * IntronLengthPerGene,][
         , prob:=ptpois(q=readcount.in ,lambda=lambda, lower.tail = F  ) , by= c("RG","GE")]

  setorder(dt, GE, RG)

  return (dt[,c("GE","RG","prob"), with = FALSE])
}
+328 −49

File changed.

Preview size limit exceeded, changes collapsed.

+13 −0
Original line number Diff line number Diff line
@@ -248,3 +248,16 @@ totReadBoxplot<-function(typeCount,fillcol){
              axis.title.x=element_blank())
  return(box)
}

sumGene <- function(counts){
  outlist <- lapply(names(counts$readcount), function(x){
    res <- Matrix::rowSums(counts$readcount[[x]]$all)
    out <- data.table(
      GeneID = names(res),
      nReads = as.numeric(res),
      ftype = x
    )
  })
  outdt <- rbindlist(outlist)
  return(outdt)
}
 No newline at end of file
+42 −16
Original line number Diff line number Diff line
@@ -50,28 +50,33 @@ bccount<-splitRG(bccount=bccount, mem= opt$mem_limit)
##############################################################
##### featureCounts

abamfile<-paste0(opt$out_dir,"/",opt$project,".filtered.tagged.Aligned.out.bam")
outbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.bam")

## gene annotation
saf<-.makeSAF(paste0(opt$out_dir,"/",opt$project,".final_annot.gtf"))
saf<-.makeSAF(gtf = paste0(opt$out_dir,"/",opt$project,".final_annot.gtf"),
              extension_var = opt$reference$exon_extension,
              exon_extension = opt$reference$extension_length,
              buffer_length = (opt$reference$extension_length / 2),
              scaff_length = opt$reference$scaffold_length_min,
              multi_overlap_var = opt$counting_opts$multi_overlap)
try(gene_name_mapping <- .get_gene_names(gtf = paste0(opt$out_dir,"/",opt$project,".final_annot.gtf"), threads = opt$num_threads), silent = TRUE)
try(data.table::fwrite(gene_name_mapping, file = paste0(opt$out_dir,"/zUMIs_output/expression/",opt$project,".gene_names.txt"), sep ="\t", quote = FALSE), silent = TRUE)
##

abamfile<-paste0(opt$out_dir,"/",opt$project,".filtered.tagged.Aligned.out.bam")
outbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.bam")

if(smart3_flag & opt$counting_opts$strand == 1){
  #split bam in UMU ends and internal
  tmp_bams <- split_bam(bam = abamfile, cpu = opt$num_threads, samtoolsexc=samtoolsexc)

  #assign features with appropriate strand
  fnex_int<-.runFeatureCount(tmp_bams[1], saf=saf$exons, strand=0, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
  fnex_umi<-.runFeatureCount(tmp_bams[2], saf=saf$exons, strand=1, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
  fnex_int<-.runFeatureCount(tmp_bams[1], saf=saf$exons, strand=0, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
  fnex_umi<-.runFeatureCount(tmp_bams[2], saf=saf$exons, strand=1, type="ex", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
  ffiles_int <- paste0(fnex_int,".tmp")
  ffiles_umi <- paste0(fnex_umi,".tmp")

  if(opt$counting_opts$introns){
    fnin_int<-.runFeatureCount(ffiles_int, saf=saf$introns, strand=0, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
    fnin_umi<-.runFeatureCount(ffiles_umi, saf=saf$introns, strand=1, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib)
    fnin_int<-.runFeatureCount(ffiles_int, saf=saf$introns, strand=0, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
    fnin_umi<-.runFeatureCount(ffiles_umi, saf=saf$introns, strand=1, type="in", primaryOnly = opt$counting_opts$primaryHit, cpu = opt$num_threads, mem = opt$mem_limit, fcounts_clib = fcounts_clib, multi_overlap_var = opt$counting_opts$multi_overlap)
    ffiles_int <- paste0(fnin_int,".tmp")
    ffiles_umi <- paste0(fnin_umi,".tmp")
  }
@@ -86,7 +91,8 @@ if(smart3_flag & opt$counting_opts$strand == 1){
                         primaryOnly = opt$counting_opts$primaryHit,
                         cpu = opt$num_threads,
                         mem = opt$mem_limit,
                         fcounts_clib = fcounts_clib)
                         fcounts_clib = fcounts_clib,
                         multi_overlap_var = opt$counting_opts$multi_overlap)
  ffiles<-paste0(fnex,".tmp")

  if(opt$counting_opts$introns){
@@ -97,7 +103,8 @@ if(smart3_flag & opt$counting_opts$strand == 1){
                             primaryOnly = opt$counting_opts$primaryHit,
                             cpu = opt$num_threads,
                             mem = opt$mem_limit,
                             fcounts_clib = fcounts_clib)
                             fcounts_clib = fcounts_clib,
                             multi_overlap_var = opt$counting_opts$multi_overlap)
    system(paste0("rm ",fnex,".tmp"))
    ffiles<-paste0(fnin,".tmp")
  }
@@ -223,9 +230,28 @@ if(UMIcheck == "nonUMI" | smart3_flag == TRUE ){

saveRDS(final,file=paste(opt$out_dir,"/zUMIs_output/expression/",opt$project,".dgecounts.rds",sep=""))

#################
#Accessory processing scripts
################# #Accessory processing scripts
#Intron Probability Score Calculation
if(opt$counting_opts$intronProb == TRUE){
  if(opt$counting_opts$introns == FALSE){
    print("Intron information is needed to calculate Intron-Probability! Please change yaml settings counting_opts->introns to yes.\n Skipping probability calculation...")
  }else{
    library(extraDistr)
    print("Starting Intron-Probability Calculations...")

    # Extractingreads from sam files again, this could be sped up by integrating code further upstream of zUMIs-dge2
    genesWithIntronProb<-.intronProbability(bccount=bccount,
                                            featfile=sortbamfile,
                                            inex=opt$counting_opts$introns,
                                            cores=opt$num_threads,
                                            samtoolsexc=samtoolsexc,
                                            allC=allC,
                                            saf=saf)
    saveRDS(genesWithIntronProb, file = paste0(opt$out_dir,"/zUMIs_output/stats/",opt$project,".intronProbability.rds"))
  }
}

#demultiplexing
if(opt$counting_opts$Ham_Dist == 0 && opt$barcodes$demultiplex == TRUE ){ #otherwise its already demultiplexed!
  print("Demultiplexing output bam file by cell barcode...")
  demultiplex_bam(opt, sortbamfile, nBCs = length(unique(bccount$XC)))
Loading