fastq merging (a6ddb900) · Commits · github_fork / ZUMIs

README.md

+2 −0

Original line number	Diff line number	Diff line
		@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
		zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

		## Changelog
		18 May 2020: zUMIs2.8.2: Added `merge_demultiplexed_fastq.R` to concatenate previously demultiplexed fastq files. For usage details see here: https://github.com/sdparekh/zUMIs/wiki/Starting-from-demultiplexed-fastq-files

		15 May 2020: zUMIs2.8.1: Smart-seq3 UMI pattern detection now takes one hamming distance error into account.

		02 May 2020: zUMIs2.8.0: Ensure correct behaviour when resuming from mapping (`which_stage: Mapping`). It is now possible to fractionally count reads overlapping multiple features (on the same strand if strand-specificity is used) `multi_overlap: yes`. New function for parsing the gene annotations: Fixed a problem with genes completely within other another gene's intron & added the possibility to extend genomic interval of exons (this is useful for crossmapping species with poor annotations). Added a probability calculation to compare the occurance of intronic reads to an expectation generated from background intergenic mapping reads (`intronProb: yes`). Added statistics output for the total number of reads observed per gene.

misc/merge_demultiplexed_fastq.R

0 → 100644

+73 −0

Original line number	Diff line number	Diff line
		library(stringi)
		library(optparse)
		library(data.table)
		setDTthreads(1)

		option_list <- list(
		make_option(c("-d", "--dir"), type="character",
		help="Directory with fastq files. Mandatory"),
		make_option(c("-p", "--pigz"), type="character",
		help="Executable for pigz. Default: pigz.", default = "pigz"),
		make_option(c("-t","--threads"), type="integer",
		help="Number of threads to use. Default: 24",
		default=24)
		)
		opt <- parse_args(OptionParser(option_list=option_list))

		fastq_directory <- opt$dir
		#fastq_directory <- "~/projects/HEK_fixation/fastq/"
		fastq_files <- list.files(path = fastq_directory, pattern = ".[fastq\|fq].gz")

		### first check if the fastq file names are bcl2fastq style or SRA style:
		num_files_sra <- sum(grepl(pattern = '_[1-2].fastq.gz', fastq_files))
		num_files_bcl <- sum(grepl(pattern = '_R[1-2]_', fastq_files))

		if(num_files_bcl >= num_files_sra){
		style <- 'bcl2fastq'
		file_delim_r1 <- "_R1_"
		file_delim_r2 <- "_R2_"
		}else{
		style <- 'SRA'
		file_delim_r1 <- "_1.fastq.gz"
		file_delim_r2 <- "_2.fastq.gz"
		}
		print(paste("Detected files to be in",style,"format."))


		read1_files <- grep(file_delim_r1, fastq_files, value = TRUE)
		read2_files <- grep(file_delim_r2, fastq_files, value = TRUE)

		#check if SE or PE data
		if(length(read2_files) == 0){
		mode <- "SE"
		}else{
		mode <- "PE"
		}

		#terminate if there is no data!
		if(length(read1_files) == 0){
		print("No valid fastq files found!")
		stop()
		}

		samples <- data.table(r1 = read1_files)
		if(mode == "PE") samples[,r2 := read2_files]
		samples[, sample := tstrsplit(r1, file_delim_r1, keep = 1)][
		, BC := stringi::stri_rand_strings(.N, 8, pattern = "[A-Z]")]

		outfile_r1 <- paste0(opt$dir,"/reads_for_zUMIs.R1.fastq")
		outfile_r2 <- paste0(opt$dir,"/reads_for_zUMIs.R2.fastq")
		outfile_index <- paste0(opt$dir,"/reads_for_zUMIs.index.fastq")

		for(i in seq(nrow(samples))){
		system(paste("zcat", paste(opt$dir,samples[i]$r1,sep = "/"), ">>", outfile_r1))
		if(mode == "PE") system(paste("zcat", paste(opt$dir,samples[i]$r2,sep = "/"), ">>", outfile_r2))
		system(paste0("zcat ", paste(opt$dir,samples[i]$r1,sep = "/"), " \| awk -v bc=\"",samples[i]$BC,"\" '{i++;if(i==1 \|\| i==3){print;}if(i==2){print bc;}if(i==4){i=0;print \"AAAAAAAA\";}}' >> ",outfile_index))
		}

		system(paste(opt$pigz,"-p",opt$threads,outfile_r1))
		if(mode == "PE") system(paste(opt$pigz,"-p",opt$threads,outfile_r2))
		system(paste(opt$pigz,"-p",opt$threads,outfile_index))

		fwrite(samples, file = paste0(opt$dir,"/reads_for_zUMIs.samples.txt"), quote = F, sep = "\t")
		write(samples$BC, file = paste0(opt$dir,"/reads_for_zUMIs.expected_barcodes.txt"))

zUMIs-master.sh

+1 −1

Original line number	Diff line number	Diff line
		@@ -3,7 +3,7 @@
		# Pipeline to run UMI-seq analysis from fastq to read count tables.
		# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
		# Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
		vers=2.8.1
		vers=2.8.2
		currentv=`curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/master/zUMIs-master.sh \| grep '^vers=' \| cut -f2 -d "="`
		if [ "$currentv" != "$vers" ]; then echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------"; fi

Admin message