Janitorial: fix up compiler warnings; add BAM file header (f10d7eb2) · Commits · github_fork / ZUMIs

distilReads.pm

+11 −6

Original line number	Diff line number	Diff line
		@@ -27,9 +27,10 @@ sub makeFileHandles{

		$fh="file".$j;

		if ( open $fh, '<', $file ) {
		$file_handles{ $fh } = $file.":".$pat.":".$fpat.":".$cframe;
		close $fh;
		if ( open $fhTmp, '<', $file ) {
		$file_handles{ $fh }{"id"} = $file.":".$pat.":".$fpat.":".$cframe;
		$file_handles{ $fh }{"handle"} = $fhTmp;
		close $fhTmp;
		}
		else {
		warn "couldn't open $file for reading: $!\n";
		@@ -39,8 +40,12 @@ sub makeFileHandles{
		}

		sub checkPhred{
		my ($bqseq) = @_;
		@quals = map {$_} unpack "C*", $bqseq;
		if(grep {$_ > 74} @quals){$offset=64;}else{$offset=33;}
		my $offset = 33;
		if (grep {$_ > 74} @quals) {
		$offset = 64;
		}
		return $offset;
		}

fqfilter_v2.pl

+109 −86

Original line number	Diff line number	Diff line
		@@ -13,14 +13,11 @@ if (@ARGV != 6) {
		exit;
		}

		my $argLine = join(" ", @ARGV);

		my ($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix);
		BEGIN {
		## TODO: use `(a,b,c) = @ARGV`
		$yml=$ARGV[0];
		$samtoolsexc=$ARGV[1];
		$rscriptexc=$ARGV[2];
		$pigz=$ARGV[3];
		$zumisdir=$ARGV[4];
		$tmpPrefix=$ARGV[5];
		($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix) = @ARGV;
		}

		use lib "$zumisdir";
		@@ -28,27 +25,29 @@ use distilReads;
		use Approx;

		open(YL,"$rscriptexc $zumisdir/readYaml4fqfilter.R $yml \|");
		@arg=<YL>;
		my @arg=<YL>;
		close YL;
		%argHash;
		@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
		my %argHash;
		my @params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
		"BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");

		for ($i=0;$i<=$#params;$i++) {
		for (my $i=0; $i<=$#params; $i++) {
		$argHash{$params[$i]}=$arg[$i];
		}

		# parse the fastqfiles and make a hash with file handles as key and filename.pattern as value
		# TODO: convert to `map {$_ = distilReads::argClean($argHash{$_}); chomp} @params`, or similar
		$f = distilReads::argClean($argHash{"filenames"});
		$st = distilReads::argClean($argHash{"seqtype"});
		$outdir = distilReads::argClean($argHash{"outdir"});
		$StudyName = distilReads::argClean($argHash{"StudyName"});
		$num_threads = distilReads::argClean($argHash{"num_threads"});
		$BCfilter = distilReads::argClean($argHash{"BCfilter"});
		$UMIfilter = distilReads::argClean($argHash{"UMIfilter"});
		$pattern = distilReads::argClean($argHash{"find_pattern"});
		$frameshift = distilReads::argClean($argHash{"correct_frameshift"});
		# or better: use argHash directly
		my ($f, $st, $outdir, $StudyName, $num_threads, $BCfilter, $UMIfilter, $pattern, $frameshift) =
		(distilReads::argClean($argHash{"filenames"}),
		distilReads::argClean($argHash{"seqtype"}),
		distilReads::argClean($argHash{"outdir"}),
		distilReads::argClean($argHash{"StudyName"}),
		distilReads::argClean($argHash{"num_threads"}),
		distilReads::argClean($argHash{"BCfilter"}),
		distilReads::argClean($argHash{"UMIfilter"}),
		distilReads::argClean($argHash{"find_pattern"}),
		distilReads::argClean($argHash{"correct_frameshift"}));

		chomp($f);
		chomp($st);
		@@ -59,62 +58,71 @@ chomp($BCfilter);
		chomp($UMIfilter);
		chomp($pattern);
		chomp($frameshift);
		$isPass="pass";
		my $isPass="pass";

		$outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
		$outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";
		my $outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
		my $outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";

		# Make and open all the file handles
		%file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);
		my %file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);

		# get all the filehandles in @keys
		@keys = sort(keys %file_handles);
		my @keys = sort(keys %file_handles);

		for ($i=0;$i<=$#keys;$i++) {
		$fh = $keys[$i];
		@fp = split(":",$file_handles{$fh});
		for (my $i=0; $i<=$#keys; $i++) {
		my $fhName = $keys[$i];
		my $fh = $file_handles{$fhName}{"handle"};
		my @fp = split(":",$file_handles{$fhName}{"id"});

		if ($fp[0] =~ /\.gz$/) {
		$oriF = $fp[0];
		$oriBase = `basename $oriF .gz`;
		if ($fp[0] =~ /\.gz$/) { # check file name
		my $oriF = $fp[0];
		my $oriBase = `basename $oriF .gz`;
		chomp($oriBase);

		#change the file name to temporary prefix for its chunk
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open $fh, '-\|', $pigz, '-dc', $chunk \|\| die "Couldn't open file ".$chunk.
		my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open my $fhTmp, '-\|', $pigz, '-dc', $chunk \|\| die "Couldn't open file ".$chunk.
		". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		$file_handles{$fhName}{"handle"} = $fhTmp;
		} else {
		$oriF = $fp[0];
		$oriBase = `basename $oriF'`;
		my $oriF = $fp[0];
		my $oriBase = `basename $oriF'`;
		#change the file name to temporary prefix for its chunk
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open $fh, "<", $chunk \|\| die "Couldn't open file ".$chunk.
		my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
		open my $fhTmp, "<", $chunk \|\| die "Couldn't open file ".$chunk.
		". Check permissions!\n Check if it is differently zipped then .gz\n\n";
		$file_handles{$fhName}{"handle"} = $fhTmp;
		}
		}

		$total = 0;
		$filtered = 0;
		%bclist;
		my $total = 0;
		my $filtered = 0;
		my $discarded = 0;
		my $phredoffset;
		my %bclist;

		open(BCBAM,"\| $samtoolsexc view -Sb - > $outbam");

		# First file handle to start the while loop for the first file
		$fh1 = $keys[0];
		@fp1 = split(":",$file_handles{$fh1});
		$count = 0;
		my $fhName1 = $keys[0];
		my $fh1 = $file_handles{$fhName1}{"handle"};
		my @fp1 = split(":",$file_handles{$fhName1}{"id"});
		my $count = 0;
		my $printedHeader = 0; # false
		# reading the first file while others are processed in parallel within
		while (<$fh1>) {
		$total++;
		$rid=$_;
		$rseq=<$fh1>;
		$qid=<$fh1>;
		$qseq=<$fh1>;
		$p1 = $fp1[1];
		$p2 = $fp1[2];
		$p3 = $fp1[3];
		$ss3 = "yespattern";
		#$flag = 0;
		my $rid=$_;
		my $rseq=<$fh1>;
		my $qid=<$fh1>;
		my $qseq=<$fh1>;
		my $p1 = $fp1[1];
		my $p2 = $fp1[2];
		my $p3 = $fp1[3];
		my $ss3 = "yespattern";
		my $mcrseq;
		my $checkpattern;
		my $goahead;
		#This block checks if the read should have certain pattern
		if ($p2 =~ /^character/) {
		$mcrseq = $rseq;
		@@ -140,8 +148,7 @@ while (<$fh1>) {

		#This block checks if the read should be read corrected for frameshift in BC pattern
		if ($p3 !~ /^character/) {
		@bla = split($p3,$rseq);
		#next if($#bla != 1);
		my @bla = split($p3,$rseq);
		if ($#bla != 1) {
		$isPass = "fail";
		} else {
		@@ -149,7 +156,7 @@ while (<$fh1>) {
		}
		#print $isPass,"\n";
		if ($isPass ne "fail") {
		$qst = length($rseq) - length($isPass);
		my $qst = length($rseq) - length($isPass);
		$qseq = substr($qseq, $qst);
		$rseq = $isPass;
		}
		@@ -159,7 +166,7 @@ while (<$fh1>) {
		$count=1;
		$phredoffset = distilReads::checkPhred($qseq);
		}
		($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
		my ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
		("","","","","","","","",0,"SE");

		if ($isPass ne "fail") {
		@@ -167,16 +174,17 @@ while (<$fh1>) {
		distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
		}

		for ($i=1;$i<=$#keys;$i++) {
		$fh = $keys[$i];
		@fp = split(":",$file_handles{$fh});
		$rid1=<$fh>;
		$rseq1=<$fh>;
		$qid1=<$fh>;
		$qseq1=<$fh>;
		$p = $fp[1];
		$pf = $fp[2];
		$pf2 = $fp[3];
		for (my $i=1; $i<=$#keys; $i++) {
		my $fhName = $keys[$i];
		my $fh = $file_handles{$fhName}{"handle"};
		my @fp = split(":",$file_handles{$fhName}{"id"});
		my $rid1=<$fh>;
		my $rseq1=<$fh>;
		my $qid1=<$fh>;
		my $qseq1=<$fh>;
		my $p = $fp[1];
		my $pf = $fp[2];
		my $pf2 = $fp[3];
		$ss3 = "yespattern";

		#This block checks if the read should have certian pattern
		@@ -192,7 +200,7 @@ while (<$fh1>) {
		# If it is smart-seq3 pattern in the YAML file but not found in the read then the read
		# is retained as full cDNA read where UMI is null.
		if ($pf eq "ATTGCGCAATG") {
		$af = substr($mcrseq,0,length($pf));
		my $af = substr($mcrseq,0,length($pf));
		if (Approx::amatch($checkpattern, [ 1 ],$af)) {
		$ss3 = "yespattern";
		$checkpattern = $pf;
		@@ -204,7 +212,7 @@ while (<$fh1>) {

		#This block checks if the read should be read corrected for frameshift in BC pattern
		if ($pf2 !~ /^character/) {
		@bla = split($pf2,$rseq1);
		my @bla = split($pf2,$rseq1);
		#next if($#bla != 1);
		if ($#bla != 1) {
		$isPass = "fail";
		@@ -213,14 +221,14 @@ while (<$fh1>) {
		}
		#print $isPass,"\n";
		if ($isPass ne "fail") {
		$qst = length($rseq1) - length($isPass);
		my $qst = length($rseq1) - length($isPass);
		$qseq1 = substr($qseq1, $qst);
		$rseq1 = $isPass;
		}
		}

		@c = split(/\/\|\s/,$rid);
		@b = split(/\/\|\s/,$rid1);
		my @c = split(/\/\|\s/,$rid);
		my @b = split(/\/\|\s/,$rid1);
		if ($c[0] ne $b[0]) {
		print $c[0],"\n",$b[0],"\n";
		print "ERROR! Fastq files are not in the same order.\n Make sure to provide reads in the same order.\n\n";
		@@ -228,6 +236,7 @@ while (<$fh1>) {
		}

		# get the BC, UMI and cDNA sequences from all the given fastq files and concatenate according to given ranges
		my ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay);
		if ($isPass ne "fail") {
		($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
		distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
		@@ -244,20 +253,20 @@ while (<$fh1>) {
		#next if($flag = 1); # if correct_Frame is present and the pattern is not found in the read

		# Check the quality filter thresholds given
		@bcthres = split(" ",$BCfilter);
		@umithres = split(" ",$UMIfilter);
		my @bcthres = split(" ",$BCfilter);
		my @umithres = split(" ",$UMIfilter);

		if (($bcthres[0] < 0) \|\| (length($bcthres) > length($bcseq))) {
		if (($bcthres[0] < 0) \|\| (scalar(@bcthres) > length($bcseq))) {
		$bcthres[0] = 1;
		}
		if (($umithres[0] < 0) \|\| (length($umithres) > length($ubseq))) {
		if (($umithres[0] < 0) \|\| (scalar(@umithres) > length($ubseq))) {
		$umithres[0] = 1;
		}
		# map to the correct phredoffset and get the number of bases under given quality threshold
		@bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
		@mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
		$btmp = grep {$_ < $bcthres[1]} @bquals;
		$mtmp = grep {$_ < $umithres[1]} @mquals;
		my @bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
		my @mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
		my $btmp = grep {$_ < $bcthres[1]} @bquals;
		my $mtmp = grep {$_ < $umithres[1]} @mquals;

		## Check if it is a smartseq3 pattern. If so, allow for approximate
		## match with 1 base distance. If it is not a smartseq3 pattern,
		@@ -267,7 +276,7 @@ while (<$fh1>) {
		# IF the read should not have any pattern, the $checkpattern is
		# equal to $mcrseq so $goahead variable will stay "yes"
		if ($checkpattern eq "ATTGCGCAATG") {
		$ac = substr($mcrseq,0,length($checkpattern));
		my $ac = substr($mcrseq,0,length($checkpattern));
		if (Approx::amatch($checkpattern, [ 1 ],$ac)) {
		$goahead = "yes";
		} else {
		@@ -284,7 +293,7 @@ while (<$fh1>) {
		# TODO: convert string checks to logical comparisons
		if (($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($goahead eq "yes") && ($isPass ne "fail")) {
		chomp($rid);

		my $ridtmp;
		if ($rid =~ m/^\@.*\s/) {
		$rid =~ m/^\@(.*)\s/;
		$ridtmp = $1;
		@@ -295,7 +304,11 @@ while (<$fh1>) {

		$filtered++;
		$bclist{$bcseq}++;
		#print $lay,"\n";
		if(!$printedHeader){
		print(BCBAM join("\t", ("@"."PG","ID:zUMIs-fqfilter","PN:zUMIs-fqfilter", "VN:2",
		"CL:fqfilter_v2.pl ${argLine}")) . "\n");
		$printedHeader = 1; # true
		}
		if ($lay eq "SE") {
		print BCBAM $ridtmp,"\t4\t\t0\t0\t\t*\t0\t0\t",
		$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		@@ -305,16 +318,26 @@ while (<$fh1>) {
		print BCBAM $ridtmp,"\t141\t\t0\t0\t\t*\t0\t0\t",
		$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
		}
		} else {
		$discarded++;
		}
		}

		close BCBAM;
		for ($i=0;$i<=$#keys;$i++) {
		close $keys[$i];
		for (my $i=0; $i<=$#keys; $i++) {
		my $fhTmp = $file_handles{$keys[$i]}{"handle"};
		close $fhTmp;
		}

		if ($filtered == 0) {
		print(STDERR "No reads emitted; please check the parameter file and the warnings above.\n");
		exit;
		} else {
		print(STDERR "reads kept: $filtered; reads discarded: $discarded\n");
		}

		open BCOUT, '>', $outbcstats \|\| die "Couldn't open file ".$outbcstats.". Check permissions!\n";
		foreach $bc (keys %bclist) {
		foreach my $bc (keys %bclist) {
		print BCOUT $bc,"\t",$bclist{$bc},"\n";
		}
		close BCOUT;

Admin message