Commit a6ddb900 authored by cziegenhain's avatar cziegenhain
Browse files

fastq merging

parent e105a3ad
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
18 May 2020: zUMIs2.8.2: Added `merge_demultiplexed_fastq.R` to concatenate previously demultiplexed fastq files. For usage details see here: https://github.com/sdparekh/zUMIs/wiki/Starting-from-demultiplexed-fastq-files

15 May 2020: zUMIs2.8.1: Smart-seq3 UMI pattern detection now takes one hamming distance error into account.

02 May 2020: zUMIs2.8.0: Ensure correct behaviour when resuming from mapping (`which_stage: Mapping`). It is now possible to fractionally count reads overlapping multiple features (on the same strand if strand-specificity is used) `multi_overlap: yes`. New function for parsing the gene annotations: Fixed a problem with genes completely within other another gene's intron & added the possibility to extend genomic interval of exons (this is useful for crossmapping species with poor annotations). Added a probability calculation to compare the occurance of intronic reads to an expectation generated from background intergenic mapping reads (`intronProb: yes`). Added statistics output for the total number of reads observed per gene.
+73 −0
Original line number Diff line number Diff line
library(stringi)
library(optparse)
library(data.table)
setDTthreads(1)

option_list <- list(
  make_option(c("-d", "--dir"), type="character",
              help="Directory with fastq files. Mandatory"),
  make_option(c("-p", "--pigz"), type="character",
              help="Executable for pigz. Default: pigz.", default = "pigz"),
  make_option(c("-t","--threads"), type="integer",
                help="Number of threads to use. Default: 24",
              default=24)
)
opt <- parse_args(OptionParser(option_list=option_list))

fastq_directory <- opt$dir
#fastq_directory <- "~/projects/HEK_fixation/fastq/"
fastq_files <- list.files(path = fastq_directory, pattern = ".[fastq|fq].gz")

### first check if the fastq file names are bcl2fastq style or SRA style:
num_files_sra <- sum(grepl(pattern = '_[1-2].fastq.gz', fastq_files))
num_files_bcl <- sum(grepl(pattern = '_R[1-2]_', fastq_files))

if(num_files_bcl >= num_files_sra){
  style <- 'bcl2fastq'
  file_delim_r1 <- "_R1_"
  file_delim_r2 <- "_R2_"
}else{
  style <- 'SRA'
  file_delim_r1 <- "_1.fastq.gz"
  file_delim_r2 <- "_2.fastq.gz"
}
print(paste("Detected files to be in",style,"format."))


read1_files <- grep(file_delim_r1, fastq_files, value = TRUE)
read2_files <- grep(file_delim_r2, fastq_files, value = TRUE)

#check if SE or PE data
if(length(read2_files) == 0){
  mode <- "SE"
}else{
  mode <- "PE"
}

#terminate if there is no data!
if(length(read1_files) == 0){
  print("No valid fastq files found!")
  stop()
}

samples <- data.table(r1 = read1_files)
if(mode == "PE") samples[,r2 := read2_files]
samples[, sample := tstrsplit(r1, file_delim_r1, keep = 1)][
        , BC := stringi::stri_rand_strings(.N, 8, pattern = "[A-Z]")]

outfile_r1 <- paste0(opt$dir,"/reads_for_zUMIs.R1.fastq")
outfile_r2 <- paste0(opt$dir,"/reads_for_zUMIs.R2.fastq")
outfile_index <- paste0(opt$dir,"/reads_for_zUMIs.index.fastq")

for(i in seq(nrow(samples))){
  system(paste("zcat", paste(opt$dir,samples[i]$r1,sep = "/"), ">>", outfile_r1))
  if(mode == "PE") system(paste("zcat", paste(opt$dir,samples[i]$r2,sep = "/"), ">>", outfile_r2))
  system(paste0("zcat ", paste(opt$dir,samples[i]$r1,sep = "/"), " | awk -v bc=\"",samples[i]$BC,"\" '{i++;if(i==1 || i==3){print;}if(i==2){print bc;}if(i==4){i=0;print \"AAAAAAAA\";}}' >> ",outfile_index))
}

system(paste(opt$pigz,"-p",opt$threads,outfile_r1))
if(mode == "PE") system(paste(opt$pigz,"-p",opt$threads,outfile_r2))
system(paste(opt$pigz,"-p",opt$threads,outfile_index))

fwrite(samples, file =  paste0(opt$dir,"/reads_for_zUMIs.samples.txt"), quote = F, sep = "\t")
write(samples$BC, file = paste0(opt$dir,"/reads_for_zUMIs.expected_barcodes.txt"))
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
vers=2.8.1
vers=2.8.2
currentv=`curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/master/zUMIs-master.sh | grep '^vers=' | cut -f2 -d "="`
if [ "$currentv" != "$vers" ]; then echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------"; fi