Janitorial: make indentation / bracing / comment layout consistent; remove dead comments (74f81d46) · Commits · github_fork / ZUMIs

fqfilter_v2.pl

+175 −166

Original line number	Diff line number	Diff line
		#!/usr/bin/perl
		#!/usr/bin/env perl
		use warnings;
		use strict;
		# LMU Munich. AG Enard
		# A script to filter reads based on Barcode base quality.

		if(@ARGV != 6)
		{
		if (@ARGV != 6) {
		print
		"\n#####################################################################################
		Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n
		Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n
		######################################################################################\n\n";
		"\n#####################################################################################".
		"Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n".
		"Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n".
		"######################################################################################\n\n";
		exit;
		}

		BEGIN {
		## TODO: use `(a,b,c) = @ARGV`
		$yml=$ARGV[0];
		$samtoolsexc=$ARGV[1];
		$rscriptexc=$ARGV[2];
		@@ -19,6 +22,7 @@ $pigz=$ARGV[3];
		$zumisdir=$ARGV[4];
		$tmpPrefix=$ARGV[5];
		}

		use lib "$zumisdir";
		use distilReads;
		use Approx;
		@@ -27,14 +31,15 @@ open(YL,"$rscriptexc $zumisdir/readYaml4fqfilter.R $yml \|");
		@arg=<YL>;
		close YL;
		%argHash;
		@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads", "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");

		@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
		"BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");

		for ($i=0;$i<=$#params;$i++) {
		$argHash{$params[$i]}=$arg[$i];
		}

		# parse the fastqfiles and make a hash with file handles as key and filename.pattern as value
		# TODO: convert to `map {$_ = distilReads::argClean($argHash{$_}); chomp} @params`, or similar
		$f = distilReads::argClean($argHash{"filenames"});
		$st = distilReads::argClean($argHash{"seqtype"});
		$outdir = distilReads::argClean($argHash{"outdir"});
		@@ -45,10 +50,6 @@ $UMIfilter = distilReads::argClean($argHash{"UMIfilter"});
		$pattern = distilReads::argClean($argHash{"find_pattern"});
		$frameshift = distilReads::argClean($argHash{"correct_frameshift"});


		#/data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r1.fq.gz; /data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r2.fq.gz;ACTGCTGTA
		#if find_pattern exists, readYaml4fqfilter returns "ATTGCGCAATG character(0) character(0)"

		chomp($f);
		chomp($st);
		chomp($outdir);
		@@ -80,15 +81,15 @@ for($i=0;$i<=$#keys;$i++){

		#change the file name to temporary prefix for its chunk
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open $fh, '-\|', $pigz, '-dc', $chunk \|\| die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		open $fh, '-\|', $pigz, '-dc', $chunk \|\| die "Couldn't open file ".$chunk.
		". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		} else {

		$oriF = $fp[0];
		$oriBase = `basename $oriF'`;
		#change the file name to temporary prefix for its chunk
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";

		open $fh, "<", $chunk \|\| die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		open $fh, "<", $chunk \|\| die "Couldn't open file ".$chunk.
		". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		}
		}

		@@ -114,18 +115,18 @@ while(<$fh1>){
		$p3 = $fp1[3];
		$ss3 = "yespattern";
		#$flag = 0;
		#This block checks if the read should have certian pattern
		#This block checks if the read should have certain pattern
		if ($p2 =~ /^character/) {
		$mcrseq = $rseq;
		$checkpattern = $rseq;
		}
		else{
		} else {
		$mcrseq = $rseq;
		$checkpattern = $p2;
		}

		# This block checks if smart-seq3 pattern is present and if it is found in the reads
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read
		# is retained as full cDNA read where UMI is null.
		if ($p2 eq "ATTGCGCAATG") {
		$a = substr($mcrseq,0,length($p2));
		if (Approx::amatch($checkpattern, [ 1 ],$a)) {
		@@ -137,8 +138,6 @@ while(<$fh1>){
		}
		}



		#This block checks if the read should be read corrected for frameshift in BC pattern
		if ($p3 !~ /^character/) {
		@bla = split($p3,$rseq);
		@@ -160,16 +159,17 @@ while(<$fh1>){
		$count=1;
		$phredoffset = distilReads::checkPhred($qseq);
		}
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = ("","","","","","","","",0,"SE");
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
		("","","","","","","","",0,"SE");

		if ($isPass ne "fail") {
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
		distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
		}

		for ($i=1;$i<=$#keys;$i++) {
		$fh = $keys[$i];
		@fp = split(":",$file_handles{$fh});
		# $flag = 0;
		$rid1=<$fh>;
		$rseq1=<$fh>;
		$qid1=<$fh>;
		@@ -183,14 +183,14 @@ while(<$fh1>){
		if ($pf =~ /^character/) {
		$mcrseq = $rseq1;
		$checkpattern = $rseq1;
		}
		else{
		} else {
		$mcrseq = $rseq1;
		$checkpattern = $pf;
		}

		# This block checks if smart-seq3 pattern is present and if it is found in the reads
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read
		# is retained as full cDNA read where UMI is null.
		if ($pf eq "ATTGCGCAATG") {
		$af = substr($mcrseq,0,length($pf));
		if (Approx::amatch($checkpattern, [ 1 ],$af)) {
		@@ -229,15 +229,17 @@ while(<$fh1>){

		# get the BC, UMI and cDNA sequences from all the given fastq files and concatenate according to given ranges
		if ($isPass ne "fail") {
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
		}
		else
		{
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = ("","","","","","","","",0,"SE");
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
		distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
		} else {
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
		("","","","","","","","",0,"SE");
		}

		# concatenate according to given ranges with other files
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) = ($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1, $cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) =
		($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1,
		$cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
		}
		#next if($flag = 1); # if correct_Frame is present and the pattern is not found in the read

		@@ -257,8 +259,13 @@ while(<$fh1>){
		$btmp = grep {$_ < $bcthres[1]} @bquals;
		$mtmp = grep {$_ < $umithres[1]} @mquals;

		## Check if it is a smartseq3 pattern. If so, allow for approximate match with 1 base distance. If it is not a smartseq3 pattern, check if the read starts with the pattern at all. The $goahead variable decides if the read passes the quality threshold of a pattern.
		# IF the read should not have any pattern, the $checkpattern is equal to $mcrseq so $goahead variable will stay "yes"
		## Check if it is a smartseq3 pattern. If so, allow for approximate
		## match with 1 base distance. If it is not a smartseq3 pattern,
		## check if the read starts with the pattern at all. The $goahead
		## variable decides if the read passes the quality threshold of a
		## pattern.
		# IF the read should not have any pattern, the $checkpattern is
		# equal to $mcrseq so $goahead variable will stay "yes"
		if ($checkpattern eq "ATTGCGCAATG") {
		$ac = substr($mcrseq,0,length($checkpattern));
		if (Approx::amatch($checkpattern, [ 1 ],$ac)) {
		@@ -274,18 +281,14 @@ while(<$fh1>){
		}
		}
		# print out only if above the quality threshold

		#if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ( Approx::amatch($checkpattern, [ 1 ],$mcrseq) ) && ($isPass ne "fail")){
		#if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($mcrseq =~ m/^$checkpattern/) && ($isPass ne "fail")){
		#if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0])){
		# TODO: convert string checks to logical comparisons
		if (($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($goahead eq "yes") && ($isPass ne "fail")) {
		chomp($rid);

		if ($rid =~ m/^\@.*\s/) {
		$rid =~ m/^\@(.*)\s/;
		$ridtmp = $1;
		}
		else{
		} else {
		$rid =~ m/^\@(.*)/;
		$ridtmp = $1;
		}
		@@ -294,15 +297,21 @@ while(<$fh1>){
		$bclist{$bcseq}++;
		#print $lay,"\n";
		if ($lay eq "SE") {
		print BCBAM $ridtmp,"\t4\t\t0\t0\t\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t4\t\t0\t0\t\t*\t0\t0\t",
		$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		} else {
		print BCBAM $ridtmp,"\t77\t\t0\t0\t\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t141\t\t0\t0\t\t*\t0\t0\t",$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t77\t\t0\t0\t\t*\t0\t0\t",
		$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		print BCBAM $ridtmp,"\t141\t\t0\t0\t\t*\t0\t0\t",
		$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		}
		}
		}

		close BCBAM;
		for($i=0;$i<=$#keys;$i++){ close $keys[$i]; }
		for ($i=0;$i<=$#keys;$i++) {
		close $keys[$i];
		}

		open BCOUT, '>', $outbcstats \|\| die "Couldn't open file ".$outbcstats.". Check permissions!\n";
		foreach $bc (keys %bclist) {

Admin message