Commit 4e23bf92 authored by cziegenhain's avatar cziegenhain
Browse files

2.9.4 UB tag write speed

parent 18c89335
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -31,6 +31,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
12 Sept 2020: [zUMIs2.9.4](https://github.com/sdparekh/zUMIs/releases/tag/2.9.4): Significantly speed up writing of error-corrected UMI tags to the bam file. Prevent potential crash when no cells meet any user-defined downsampling criteria.

19 July 2020: [zUMIs2.9.3](https://github.com/sdparekh/zUMIs/releases/tag/2.9.3): Add zUMIs version number to header of unmapped bam files. Several bug fixes: prevent error during mapping with memory handling; incorrect Smart-seq3 UMI-fragment counting.

14 July 2020: [zUMIs2.9.2](https://github.com/sdparekh/zUMIs/releases/tag/2.9.2): Several bug fixes: Prevent RAM from ballooning, issues with resuming from a different stage. Speed up demultiplexing further by chromosome-wise operations. Remove need for second bam file sorting after hamming collapse by keeping sort order.
+50 −34
Original line number Diff line number Diff line
@@ -334,6 +334,22 @@ write_molecule_mapping <- function(mm){
  }
}

# Run the external correct_UBtag.py helper to write a BAM file with
# corrected UB tags.
#
# Arguments:
#   inbamfile  path of the BAM file handed to the helper via --bam
#   n          name used to build the molecule-mapping path stub
#              (<out_dir>/zUMIs_output/molecule_mapping/<n>.)
#
# Reads out_dir, project, num_threads and zUMIs_directory from the global
# `opt` list.
#
# Returns the path of the corrected BAM file. Stops with an error when the
# helper exits with a non-zero status, so that callers never continue (or
# delete their input) on the basis of a missing or truncated output file.
correct_UB_tags_new <- function(inbamfile,n){
  mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/",n,".")
  outbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
  bcpath <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes_binned.txt")
  use_threads <- opt$num_threads
  pypath <- paste0(opt$zUMIs_directory,"/correct_UBtag.py")

  # Quote every path so that directories containing spaces or shell
  # metacharacters survive the shell; expand a leading "~" first because
  # the shell no longer does so once the argument is quoted.
  quote_path <- function(p) shQuote(path.expand(p))

  UBcmd <- paste("python3", quote_path(pypath),
                 "--bam", quote_path(inbamfile),
                 "--out", quote_path(outbamfile),
                 "--p", use_threads,
                 "--bcs", quote_path(bcpath),
                 "--stub", quote_path(mm_path))

  exit_status <- system(UBcmd)
  if (exit_status != 0) {
    stop("correct_UBtag.py failed with exit status ", exit_status,
         " while processing ", inbamfile, call. = FALSE)
  }
  return(outbamfile)
}

correct_UB_tags <- function(bccount, samtoolsexc){
  mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
  demux_path <- paste0(opt$out_dir,"/zUMIs_output/demultiplexed/")
+12 −8
Original line number Diff line number Diff line
@@ -33,6 +33,10 @@ setDownSamplingOption<-function( down ,bccount, filename=NULL){
  }
  colnames(subsample.splits)<-c("minR","maxR")

  if(nrow(subsample.splits) == 0){
    subsample.splits <- setDownSamplingOption(down = "0", bccount = bccount, filename = filename)
  }
  
  return( subsample.splits )
}

correct_UBtag.py

0 → 100644
+136 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
import os
import pysam
import argparse
import multiprocessing as mp

def collect_bam_chunks(inpath, chrs, outpath):
    """Concatenate the per-chromosome temporary BAM files into `outpath`.

    The chunk for every entry of `chrs` except the last is expected at
    `<inpath>.tmp.<entry>.bam`; the final chunk is always
    `<inpath>.tmp.unmapped.bam` (presumably the last entry of `chrs` is the
    '*' record for unmapped reads -- confirm against the caller).
    All chunk files are deleted after concatenation.
    """
    prefix = inpath + ".tmp."
    chunk_paths = []
    for contig in chrs[:-1]:
        chunk_paths.append(prefix + contig + ".bam")
    chunk_paths.append(prefix + "unmapped" + ".bam")

    pysam.cat('-o', outpath, *chunk_paths)

    # The temporary chunks are no longer needed once they are merged.
    for chunk in chunk_paths:
        os.remove(chunk)
    #pysam.index(outpath)

def load_bcs(bcpath):
    """Return the first comma-separated field of every non-header line.

    The first line of `bcpath` is treated as a header and discarded.
    """
    with open(bcpath) as handle:
        next(handle, None)  # skip the header line, if there is one
        barcodes = [record.split(',')[0] for record in handle]
    return barcodes

def load_dict(stub, bcs):
    """Build a nested lookup from the per-barcode tables `<stub><bc>.txt`.

    For every barcode in `bcs` whose table exists, the header line is
    skipped and each remaining tab-separated row contributes
    `result[bc][row[3]][row[0]] = row[1]`. Barcodes without a table file
    are absent from the result.
    """
    molecules_dict = {}
    for barcode in bcs:
        table_path = stub + barcode + ".txt"
        if not os.path.exists(table_path):
            continue
        per_key = {}
        molecules_dict[barcode] = per_key
        with open(table_path) as handle:
            next(handle, None)  # skip the header line
            for record in handle:
                fields = record.strip().split('\t')
                # Later rows with the same (fields[3], fields[0]) overwrite earlier ones.
                per_key.setdefault(fields[3], {})[fields[0]] = fields[1]
    return molecules_dict

# def return_UB(moldict, BC, GE, UX):
#     UB = UX
#     if BC in moldict:
#         if GE in moldict[BC]:
#             if UX in moldict[BC][GE]:
#                 UB = moldict[BC][GE][UX]
#     return(UB)

def return_UB(moldict, BC, GE, UX):
    """Look up `moldict[BC][GE][UX]`, falling back to `UX` itself.

    The fallback is used whenever any of the three keys is missing.
    """
    try:
        return moldict[BC][GE][UX]
    except KeyError:
        # No entry recorded for this combination: keep the value unchanged.
        return UX

def correct_tags(inpath, threads, chr):
    """Rewrite the UB tag of every read fetched for one reference.

    Reads are fetched from `inpath` for `chr` and written to
    `<inpath>.tmp.<chr>.bam` (`'*'` is written as `unmapped`). For each read
    the existing UB value is copied into a UX tag, and UB is replaced by the
    value looked up via `return_UB` using the read's BC and GE tags (GE
    defaults to 'NA' when absent).

    Parameters
    ----------
    inpath : path of the input BAM file.
    threads : thread count passed to both pysam file handles.
    chr : reference name passed to `fetch`; '*' selects the unmapped chunk.

    The lookup table is read from the module-level `mols`, which `main()`
    assigns before the worker pool is created (NOTE(review): this relies on
    workers inheriting module state, i.e. a fork-style start method --
    confirm on the target platform).
    """
    chrlabel = 'unmapped' if chr == '*' else chr
    outpath = inpath+".tmp."+chrlabel+".bam"

    # Context managers guarantee that both BAM handles are closed -- and the
    # output flushed -- even when a read raises (e.g. a missing UB/BC tag).
    with pysam.AlignmentFile(inpath, 'rb', threads = threads) as inp, \
         pysam.AlignmentFile(outpath, 'wb', template = inp, threads = threads) as out:
        for read in inp.fetch(chr):
            umi = read.get_tag('UB')
            cell = read.get_tag('BC')
            if read.has_tag('GE'):
                gene = read.get_tag('GE')
            else:
                gene = 'NA'
            # Preserve the original value in UX before UB is overwritten.
            read.set_tag(tag = 'UX', value = umi, value_type = 'Z')
            umi_new = return_UB(moldict = mols, BC = cell, GE = gene, UX = umi)
            read.set_tag(tag = 'UB', value = umi_new, value_type = 'Z')
            out.write(read)

def main():
    """Command-line entry point: correct UB tags of a BAM file in parallel.

    Loads the kept barcodes and the molecule lookup table, processes every
    reference reported by `pysam.idxstats` in a worker pool, and finally
    concatenates the per-reference chunks into the output BAM.
    """
    # Assigned below so that worker processes can read the lookup table.
    global mols

    parser = argparse.ArgumentParser(add_help=True)
    # The four paths are mandatory: without them the script would only fail
    # later with an unrelated-looking TypeError.
    parser.add_argument('--bam', type=str, metavar='FILENAME', required=True,
                        help='Path to input BAM file')
    parser.add_argument('--out', type=str, metavar='FILENAME', required=True,
                        help='Path to output bam file')
    parser.add_argument('--p', type=int, default = 10,
                        help='Number of processes for bams')
    parser.add_argument('--bcs', type=str, metavar='FILENAME', required=True,
                        help='Path to kept barcodes')
    parser.add_argument('--stub', type=str, metavar='FILENAME', required=True,
                        help='Molecule table path stub')

    args = parser.parse_args()

    #### Interactive debugging example (kept for reference)
    #bcs = load_bcs('zUMIs_output/hSkinkept_barcodes.txt')
    #mols = load_dict('zUMIs_output/molecule_mapping/hSkin.', bcs)
    #bam = 'hSkin.filtered.Aligned.GeneTagged.sorted.bam'
    #chrs = pysam.idxstats(bam).split('\n')
    #chrs = [c.split('\t')[0] for c in chrs[:-1]]
    #pysam_workers = 3
    #n_jobs = 10
    #pool = mp.Pool(n_jobs)
    #results = [pool.apply_async(correct_tags, (bam,pysam_workers,chr, )) for chr in chrs]
    ####

    bcs = load_bcs(args.bcs)
    print("Loading molecule correction dictionary...")
    mols = load_dict(args.stub, bcs)
    print("Correcting UB tags...")

    # First column of every idxstats line; the element after the final
    # newline is empty and dropped by the [:-1] slice.
    chrs = pysam.idxstats(args.bam).split('\n')
    chrs = [c.split('\t')[0] for c in chrs[:-1]]

    # With many processes available, give each worker two pysam threads and
    # halve the number of workers accordingly.
    if args.p > 8:
        pysam_workers = 2
        n_jobs = args.p // 2
    else:
        pysam_workers = 1
        n_jobs = max(1, args.p)  # Pool() rejects a process count below 1

    # The context manager tears the pool down once all results are in, and
    # also when a worker error is re-raised by get().
    with mp.Pool(n_jobs) as pool:
        results = [pool.apply_async(correct_tags, (args.bam, pysam_workers, contig))
                   for contig in chrs]
        for res in results:
            res.get()

    collect_bam_chunks(inpath = args.bam, chrs = chrs, outpath = args.out)

if __name__ == "__main__":
    main()
+6 −4
Original line number Diff line number Diff line
@@ -161,10 +161,12 @@ if(opt$counting_opts$Ham_Dist == 0){
    reads <- reads[!UB==""] #make sure only UMI-containing reads go further
    u <- umiCollapseHam(reads,bccount, HamDist=opt$counting_opts$Ham_Dist)
  }
  print("Demultiplexing output bam file by cell barcode...")
  demultiplex_bam(opt, outbamfile, nBCs = length(unique(bccount$XC)), bccount = bccount, samtoolsexc = samtoolsexc)
  #print("Demultiplexing output bam file by cell barcode...")
  #demultiplex_bam(opt, outbamfile, nBCs = length(unique(bccount$XC)), bccount = bccount, samtoolsexc = samtoolsexc)
  print("Correcting UMI barcode tags...")
  sortbamfile <- correct_UB_tags(bccount, samtoolsexc)
  sortbamfile <- correct_UB_tags_new(outbamfile, opt$project)
  file.remove(outbamfile)
  #sortbamfile <- correct_UB_tags(bccount, samtoolsexc)
  #sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
  bccount<-splitRG(bccount=bccount, mem= opt$mem_limit, hamdist = 0) # allow more reads to be in RAM fur subsequent steps
}
@@ -274,7 +276,7 @@ if(opt$counting_opts$intronProb == TRUE){
}

#demultiplexing
if(opt$counting_opts$Ham_Dist == 0 && opt$barcodes$demultiplex == TRUE ){ #otherwise its already demultiplexed!
if(opt$barcodes$demultiplex){
  print("Demultiplexing output bam file by cell barcode...")
  demultiplex_bam(opt, sortbamfile, nBCs = length(unique(bccount$XC)), bccount = bccount, samtoolsexc = samtoolsexc)
}
Loading