revert fqfilter to remove PE bug (2cacbed9) · Commits · github_fork / ZUMIs

README.md

+2 −0

Original line number	Diff line number	Diff line
		@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
		zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

		## Changelog
		10 July 2020: [zUMIs2.9.1](https://github.com/sdparekh/zUMIs/releases/tag/2.9.1): Revert changes from pull request #194 (commit #ff7e879) that introduced a bug when using PE reads.

		05 July 2020: [zUMIs2.9.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.9.0): Speed up STAR read mapping by parallel instances if enough resources are available. Change the main zUMIs script to `zUMIs.sh`. Speed up and reduce clutter by loading reads from bam files using parallelised Rsamtools calls instead of printing temporary text files. Speed up counting by parallelising exon / intron / exon+intron counting as well as downsamplings. Speed up by parallelising creation of wide format count matrices.

		23 June 2020: zUMIs2.8.3: Merged code contribution from @gringer: prevent errors by emitting SAM headers in chunked unmapped .bam file output of fqfilter. Changed call to STAR to prevent stalling of samtools pipe.

distilReads.pm

100755 → 100644

+6 −11

Original line number	Diff line number	Diff line
		@@ -27,10 +27,9 @@ sub makeFileHandles{

		$fh="file".$j;

		if ( open $fhTmp, '<', $file ) {
		$file_handles{ $fh }{"id"} = $file.":".$pat.":".$fpat.":".$cframe;
		$file_handles{ $fh }{"handle"} = $fhTmp;
		close $fhTmp;
		if ( open $fh, '<', $file ) {
		$file_handles{ $fh } = $file.":".$pat.":".$fpat.":".$cframe;
		close $fh;
		}
		else {
		warn "couldn't open $file for reading: $!\n";
		@@ -40,12 +39,8 @@ sub makeFileHandles{
		}

		sub checkPhred{
		my ($bqseq) = @_;
		@quals = map {$_} unpack "C*", $bqseq;
		my $offset = 33;
		if (grep {$_ > 74} @quals) {
		$offset = 64;
		}
		if(grep {$_ > 74} @quals){$offset=64;}else{$offset=33;}
		return $offset;
		}

fqfilter_v2.pl

+205 −237

Original line number	Diff line number	Diff line
		#!/usr/bin/env perl
		use warnings;
		use strict;
		#!/usr/bin/perl
		# LMU Munich. AG Enard
		# A script to filter reads based on Barcode base quality.

		if (@ARGV != 6) {
		if(@ARGV != 6)
		{
		print
		"\n#####################################################################################".
		"Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n".
		"Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n".
		"######################################################################################\n\n";
		"\n#####################################################################################
		Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n
		Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n
		######################################################################################\n\n";
		exit;
		}

		my $argLine = join(" ", @ARGV);

		my ($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix);
		BEGIN{
		($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix) = @ARGV;
		$yml=$ARGV[0];
		$samtoolsexc=$ARGV[1];
		$rscriptexc=$ARGV[2];
		$pigz=$ARGV[3];
		$zumisdir=$ARGV[4];
		$tmpPrefix=$ARGV[5];
		}

		use lib "$zumisdir";
		use distilReads;
		use Approx;

		open(YL,"$rscriptexc $zumisdir/readYaml4fqfilter.R $yml \|");
		my @arg=<YL>;
		@arg=<YL>;
		close YL;
		my %argHash;
		my @params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
		"BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");
		%argHash;
		@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads", "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");


		for (my $i=0; $i<=$#params; $i++) {
		for($i=0;$i<=$#params;$i++){
		$argHash{$params[$i]}=$arg[$i];
		}

		# parse the fastqfiles and make a hash with file handles as key and filename.pattern as value
		# TODO: convert to `map {$_ = distilReads::argClean($argHash{$_}); chomp} @params`, or similar
		# or better: use argHash directly
		my ($f, $st, $outdir, $StudyName, $num_threads, $BCfilter, $UMIfilter, $pattern, $frameshift) =
		(distilReads::argClean($argHash{"filenames"}),
		distilReads::argClean($argHash{"seqtype"}),
		distilReads::argClean($argHash{"outdir"}),
		distilReads::argClean($argHash{"StudyName"}),
		distilReads::argClean($argHash{"num_threads"}),
		distilReads::argClean($argHash{"BCfilter"}),
		distilReads::argClean($argHash{"UMIfilter"}),
		distilReads::argClean($argHash{"find_pattern"}),
		distilReads::argClean($argHash{"correct_frameshift"}));
		$f = distilReads::argClean($argHash{"filenames"});
		$st = distilReads::argClean($argHash{"seqtype"});
		$outdir = distilReads::argClean($argHash{"outdir"});
		$StudyName = distilReads::argClean($argHash{"StudyName"});
		$num_threads = distilReads::argClean($argHash{"num_threads"});
		$BCfilter = distilReads::argClean($argHash{"BCfilter"});
		$UMIfilter = distilReads::argClean($argHash{"UMIfilter"});
		$pattern = distilReads::argClean($argHash{"find_pattern"});
		$frameshift = distilReads::argClean($argHash{"correct_frameshift"});


		#/data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r1.fq.gz; /data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r2.fq.gz;ACTGCTGTA
		#if find_pattern exists, readYaml4fqfilter returns "ATTGCGCAATG character(0) character(0)"

		chomp($f);
		chomp($st);
		@@ -58,83 +58,74 @@ chomp($BCfilter);
		chomp($UMIfilter);
		chomp($pattern);
		chomp($frameshift);
		my $isPass="pass";
		$isPass="pass";

		my $outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
		my $outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";
		$outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
		$outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";

		# Make and open all the file handles
		my %file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);
		%file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);

		# get all the filehandles in @keys
		my @keys = sort(keys %file_handles);
		@keys = sort(keys %file_handles);

		for (my $i=0; $i<=$#keys; $i++) {
		my $fhName = $keys[$i];
		my $fh = $file_handles{$fhName}{"handle"};
		my @fp = split(":",$file_handles{$fhName}{"id"});
		for($i=0;$i<=$#keys;$i++){
		$fh = $keys[$i];
		@fp = split(":",$file_handles{$fh});

		if ($fp[0] =~ /\.gz$/) { # check file name
		my $oriF = $fp[0];
		my $oriBase = `basename $oriF .gz`;
		if ($fp[0] =~ /\.gz$/) {
		$oriF = $fp[0];
		$oriBase = `basename $oriF .gz`;
		chomp($oriBase);

		#change the file name to temporary prefix for its chunk
		my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open my $fhTmp, '-\|', $pigz, '-dc', $chunk \|\| die "Couldn't open file ".$chunk.
		". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		$file_handles{$fhName}{"handle"} = $fhTmp;
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open $fh, '-\|', $pigz, '-dc', $chunk \|\| die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		}else {
		my $oriF = $fp[0];
		my $oriBase = `basename $oriF'`;

		$oriF = $fp[0];
		$oriBase = `basename $oriF'`;
		#change the file name to temporary prefix for its chunk
		my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open my $fhTmp, "<", $chunk \|\| die "Couldn't open file ".$chunk.
		". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		$file_handles{$fhName}{"handle"} = $fhTmp;
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";

		open $fh, "<", $chunk \|\| die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		}
		}

		my $total = 0;
		my $filtered = 0;
		my $discarded = 0;
		my $phredoffset;
		my %bclist;
		$total = 0;
		$filtered = 0;
		%bclist;

		open(BCBAM,"\| $samtoolsexc view -Sb - > $outbam");

		# First file handle to start the while loop for the first file
		my $fhName1 = $keys[0];
		my $fh1 = $file_handles{$fhName1}{"handle"};
		my @fp1 = split(":",$file_handles{$fhName1}{"id"});
		my $count = 0;
		my $printedHeader = 0; # false
		$fh1 = $keys[0];
		@fp1 = split(":",$file_handles{$fh1});
		$count = 0;
		# reading the first file while others are processed in parallel within
		while(<$fh1>){
		$total++;
		my $rid=$_;
		my $rseq=<$fh1>;
		my $qid=<$fh1>;
		my $qseq=<$fh1>;
		my $p1 = $fp1[1];
		my $p2 = $fp1[2];
		my $p3 = $fp1[3];
		my $ss3 = "yespattern";
		my $mcrseq;
		my $checkpattern;
		my $goahead;
		#This block checks if the read should have certain pattern
		$rid=$_;
		$rseq=<$fh1>;
		$qid=<$fh1>;
		$qseq=<$fh1>;
		$p1 = $fp1[1];
		$p2 = $fp1[2];
		$p3 = $fp1[3];
		$ss3 = "yespattern";
		#$flag = 0;
		#This block checks if the read should have certian pattern
		if($p2 =~ /^character/){
		$mcrseq = $rseq;
		$checkpattern = $rseq;
		} else {
		}
		else{
		$mcrseq = $rseq;
		$checkpattern = $p2;
		}

		# This block checks if smart-seq3 pattern is present and if it is found in the reads
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read
		# is retained as full cDNA read where UMI is null.
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
		if($p2 eq "ATTGCGCAATG"){
		$a = substr($mcrseq,0,length($p2));
		if(Approx::amatch($checkpattern, [ 1 ],$a)){
		@@ -146,9 +137,12 @@ while (<$fh1>) {
		}
		}



		#This block checks if the read should be read corrected for frameshift in BC pattern
		if($p3 !~ /^character/){
		my @bla = split($p3,$rseq);
		@bla = split($p3,$rseq);
		#next if($#bla != 1);
		if($#bla != 1){
		$isPass = "fail";
		}else{
		@@ -156,7 +150,7 @@ while (<$fh1>) {
		}
		#print $isPass,"\n";
		if($isPass ne "fail"){
		my $qst = length($rseq) - length($isPass);
		$qst = length($rseq) - length($isPass);
		$qseq = substr($qseq, $qst);
		$rseq = $isPass;
		}
		@@ -166,41 +160,39 @@ while (<$fh1>) {
		$count=1;
		$phredoffset = distilReads::checkPhred($qseq);
		}
		my ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
		("","","","","","","","",0,"SE");
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = ("","","","","","","","",0,"SE");

		if($isPass ne "fail"){
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
		distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
		}

		for (my $i=1; $i<=$#keys; $i++) {
		my $fhName = $keys[$i];
		my $fh = $file_handles{$fhName}{"handle"};
		my @fp = split(":",$file_handles{$fhName}{"id"});
		my $rid1=<$fh>;
		my $rseq1=<$fh>;
		my $qid1=<$fh>;
		my $qseq1=<$fh>;
		my $p = $fp[1];
		my $pf = $fp[2];
		my $pf2 = $fp[3];
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
		}

		for($i=1;$i<=$#keys;$i++){
		$fh = $keys[$i];
		@fp = split(":",$file_handles{$fh});
		# $flag = 0;
		$rid1=<$fh>;
		$rseq1=<$fh>;
		$qid1=<$fh>;
		$qseq1=<$fh>;
		$p = $fp[1];
		$pf = $fp[2];
		$pf2 = $fp[3];
		$ss3 = "yespattern";

		#This block checks if the read should have certian pattern
		if($pf =~ /^character/){
		$mcrseq = $rseq1;
		$checkpattern = $rseq1;
		} else {
		}
		else{
		$mcrseq = $rseq1;
		$checkpattern = $pf;
		}

		# This block checks if smart-seq3 pattern is present and if it is found in the reads
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read
		# is retained as full cDNA read where UMI is null.
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
		if($pf eq "ATTGCGCAATG"){
		my $af = substr($mcrseq,0,length($pf));
		$af = substr($mcrseq,0,length($pf));
		if(Approx::amatch($checkpattern, [ 1 ],$af)){
		$ss3 = "yespattern";
		$checkpattern = $pf;
		@@ -212,7 +204,7 @@ while (<$fh1>) {

		#This block checks if the read should be read corrected for frameshift in BC pattern
		if($pf2 !~ /^character/){
		my @bla = split($pf2,$rseq1);
		@bla = split($pf2,$rseq1);
		#next if($#bla != 1);
		if($#bla != 1){
		$isPass = "fail";
		@@ -221,14 +213,14 @@ while (<$fh1>) {
		}
		#print $isPass,"\n";
		if($isPass ne "fail"){
		my $qst = length($rseq1) - length($isPass);
		$qst = length($rseq1) - length($isPass);
		$qseq1 = substr($qseq1, $qst);
		$rseq1 = $isPass;
		}
		}

		my @c = split(/\/\|\s/,$rid);
		my @b = split(/\/\|\s/,$rid1);
		@c = split(/\/\|\s/,$rid);
		@b = split(/\/\|\s/,$rid1);
		if($c[0] ne $b[0]){
		print $c[0],"\n",$b[0],"\n";
		print "ERROR! Fastq files are not in the same order.\n Make sure to provide reads in the same order.\n\n";
		@@ -236,47 +228,39 @@ while (<$fh1>) {
		}

		# get the BC, UMI and cDNA sequences from all the given fastq files and concatenate according to given ranges
		my ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay);
		if($isPass ne "fail"){
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
		distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
		} else {
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
		("","","","","","","","",0,"SE");
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
		}
		else
		{
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = ("","","","","","","","",0,"SE");
		}

		# concatenate according to given ranges with other files
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) =
		($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1,
		$cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) = ($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1, $cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
		}
		#next if($flag = 1); # if correct_Frame is present and the pattern is not found in the read

		# Check the quality filter thresholds given
		my @bcthres = split(" ",$BCfilter);
		my @umithres = split(" ",$UMIfilter);
		@bcthres = split(" ",$BCfilter);
		@umithres = split(" ",$UMIfilter);

		if (($bcthres[0] < 0) \|\| (scalar(@bcthres) > length($bcseq))) {
		if(($bcthres[0] < 0) \|\| (length($bcthres) > length($bcseq))) {
		$bcthres[0] = 1;
		}
		if (($umithres[0] < 0) \|\| (scalar(@umithres) > length($ubseq))) {
		if(($umithres[0] < 0) \|\| (length($umithres) > length($ubseq))) {
		$umithres[0] = 1;
		}
		# map to the correct phredoffset and get the number of bases under given quality threshold
		my @bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
		my @mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
		my $btmp = grep {$_ < $bcthres[1]} @bquals;
		my $mtmp = grep {$_ < $umithres[1]} @mquals;

		## Check if it is a smartseq3 pattern. If so, allow for approximate
		## match with 1 base distance. If it is not a smartseq3 pattern,
		## check if the read starts with the pattern at all. The $goahead
		## variable decides if the read passes the quality threshold of a
		## pattern.
		# IF the read should not have any pattern, the $checkpattern is
		# equal to $mcrseq so $goahead variable will stay "yes"
		@bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
		@mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
		$btmp = grep {$_ < $bcthres[1]} @bquals;
		$mtmp = grep {$_ < $umithres[1]} @mquals;

		## Check if it is a smartseq3 pattern. If so, allow for approximate match with 1 base distance. If it is not a smartseq3 pattern, check if the read starts with the pattern at all. The $goahead variable decides if the read passes the quality threshold of a pattern.
		# IF the read should not have any pattern, the $checkpattern is equal to $mcrseq so $goahead variable will stay "yes"
		if($checkpattern eq "ATTGCGCAATG"){
		my $ac = substr($mcrseq,0,length($checkpattern));
		$ac = substr($mcrseq,0,length($checkpattern));
		if(Approx::amatch($checkpattern, [ 1 ],$ac)){
		$goahead = "yes";
		}else{
		@@ -290,54 +274,38 @@ while (<$fh1>) {
		}
		}
		# print out only if above the quality threshold
		# TODO: convert string checks to logical comparisons

		#if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ( Approx::amatch($checkpattern, [ 1 ],$mcrseq) ) && ($isPass ne "fail")){
		#if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($mcrseq =~ m/^$checkpattern/) && ($isPass ne "fail")){
		#if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0])){
		if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($goahead eq "yes") && ($isPass ne "fail")){
		chomp($rid);
		my $ridtmp;

		if($rid =~ m/^\@.*\s/){
		$rid =~ m/^\@(.*)\s/;
		$ridtmp = $1;
		} else {
		}
		else{
		$rid =~ m/^\@(.*)/;
		$ridtmp = $1;
		}

		$filtered++;
		$bclist{$bcseq}++;
		if(!$printedHeader){
		print(BCBAM join("\t", ("@"."PG","ID:zUMIs-fqfilter","PN:zUMIs-fqfilter", "VN:2",
		"CL:fqfilter_v2.pl ${argLine}")) . "\n");
		$printedHeader = 1; # true
		}
		#print $lay,"\n";
		if($lay eq "SE"){
		print BCBAM $ridtmp,"\t4\t\t0\t0\t\t*\t0\t0\t",
		$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t4\t\t0\t0\t\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		}else{
		print BCBAM $ridtmp,"\t77\t\t0\t0\t\t*\t0\t0\t",
		$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t141\t\t0\t0\t\t*\t0\t0\t",
		$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t77\t\t0\t0\t\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t141\t\t0\t0\t\t*\t0\t0\t",$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		}
		} else {
		$discarded++;
		}
		}

		close BCBAM;
		for (my $i=0; $i<=$#keys; $i++) {
		my $fhTmp = $file_handles{$keys[$i]}{"handle"};
		close $fhTmp;
		}

		#if ($filtered == 0) {
		# print(STDERR "No reads emitted; please check the parameter file and the warnings above.\n");
		# exit;
		#} else {
		# print(STDERR "reads kept: $filtered; reads discarded: $discarded\n");
		#}
		for($i=0;$i<=$#keys;$i++){ close $keys[$i]; }

		open BCOUT, '>', $outbcstats \|\| die "Couldn't open file ".$outbcstats.". Check permissions!\n";
		foreach my $bc (keys %bclist) {
		foreach $bc (keys %bclist){
		print BCOUT $bc,"\t",$bclist{$bc},"\n";
		}
		close BCOUT;

zUMIs-dge2.R

+5 −0

Original line number	Diff line number	Diff line
		@@ -129,12 +129,14 @@ if(opt$counting_opts$Ham_Dist == 0){

		tmpbamfile <- outbamfile
		outbamfile <- paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.sorted.bam")
		print(Sys.time())
		print("Coordinate sorting intermediate bam file...")
		sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",outbamfile," ",tmpbamfile)
		system(sort_cmd)
		index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,outbamfile)
		system(index_cmd)
		system(paste0("rm ",tmpbamfile))
		print(Sys.time())

		for(i in unique(bccount$chunkID)){
		print( paste( "Hamming distance collapse in barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
		@@ -152,12 +154,15 @@ if(opt$counting_opts$Ham_Dist == 0){
		outbamfile <- correct_UB_tags(bccount, samtoolsexc)
		sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
		}
		print(Sys.time())
		print("Coordinate sorting final bam file...")
		sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",sortbamfile," ",outbamfile)
		system(sort_cmd)
		index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,sortbamfile)
		system(index_cmd)
		system(paste0("rm ",outbamfile))
		print(Sys.time())


		##########################################
		#set Downsampling ranges

zUMIs-mapping.R

+2 −2

Original line number	Diff line number	Diff line
		@@ -110,7 +110,7 @@ if(inp$counting_opts$twoPass==TRUE){
		#finally, run STAR
		if(num_star_instances>1 & inp$which_Stage == "Filtering"){
		map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
		dir.create(path = map_tmp_dir)
		dir.create(path = map_tmp_dir,showWarnings = FALSE)
		input_split <- split(filtered_bams, ceiling(seq_along(filtered_bams) / ceiling(length(filtered_bams) / num_star_instances)))
		input_split <- sapply(input_split, paste0, collapse = ",")
		STAR_preset <- STAR_command
		@@ -130,7 +130,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
		out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
		merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
		system(paste(merge_logs,"&",merge_bams,"&",merge_txbams,"& wait"))
		system(paste0("rm ", map_tmp_dir, "tmp.", inp$project, ".*"))
		system(paste0("rm -r ", map_tmp_dir, "tmp.", inp$project, ".*"))
		}else{
		STAR_command <- paste(STAR_command,
		"--readFilesIn",paste0(filtered_bams,collapse=","),

Admin message