Commit f10d7eb2 authored by David Eccles (gringer)'s avatar David Eccles (gringer)
Browse files

Janitorial: fix up compiler warnings; add BAM file header

parent 74f81d46
Loading
Loading
Loading
Loading
+11 −6
Original line number Original line Diff line number Diff line
@@ -27,9 +27,10 @@ sub makeFileHandles{


    $fh="file".$j;
    $fh="file".$j;


    if ( open $fh, '<', $file ) {
    if ( open $fhTmp, '<', $file ) {
      $file_handles{ $fh } = $file.":".$pat.":".$fpat.":".$cframe;
      $file_handles{ $fh }{"id"} = $file.":".$pat.":".$fpat.":".$cframe;
      close $fh;
      $file_handles{ $fh }{"handle"} = $fhTmp;
      close $fhTmp;
    }
    }
    else {
    else {
      warn "couldn't open $file for reading: $!\n";
      warn "couldn't open $file for reading: $!\n";
@@ -39,8 +40,12 @@ sub makeFileHandles{
}
}


sub checkPhred{
sub checkPhred{
  my ($bqseq) = @_;
  @quals = map {$_} unpack "C*", $bqseq;
  @quals = map {$_} unpack "C*", $bqseq;
    if(grep {$_ > 74} @quals){$offset=64;}else{$offset=33;}
  my $offset = 33;
  if (grep {$_ > 74} @quals) {
    $offset = 64;
  }
  return $offset;
  return $offset;
}
}


+109 −86
Original line number Original line Diff line number Diff line
@@ -13,14 +13,11 @@ if (@ARGV != 6) {
  exit;
  exit;
}
}


my $argLine = join(" ", @ARGV);

my ($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix);
BEGIN {
BEGIN {
  ## TODO: use `(a,b,c) = @ARGV`
  ($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix) = @ARGV;
  $yml=$ARGV[0];
  $samtoolsexc=$ARGV[1];
  $rscriptexc=$ARGV[2];
  $pigz=$ARGV[3];
  $zumisdir=$ARGV[4];
  $tmpPrefix=$ARGV[5];
}
}


use lib "$zumisdir";
use lib "$zumisdir";
@@ -28,27 +25,29 @@ use distilReads;
use Approx;
use Approx;


open(YL,"$rscriptexc $zumisdir/readYaml4fqfilter.R $yml |");
open(YL,"$rscriptexc $zumisdir/readYaml4fqfilter.R $yml |");
@arg=<YL>;
my @arg=<YL>;
close YL;
close YL;
%argHash;
my %argHash;
@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
my @params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
            "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");
            "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");


for ($i=0;$i<=$#params;$i++) {
for (my $i=0; $i<=$#params; $i++) {
  $argHash{$params[$i]}=$arg[$i];
  $argHash{$params[$i]}=$arg[$i];
}
}


# parse the fastqfiles and make a hash with file handles as key and filename.pattern as value
# parse the fastqfiles and make a hash with file handles as key and filename.pattern as value
# TODO: convert to `map {$_ = distilReads::argClean($argHash{$_}); chomp} @params`, or similar
# TODO: convert to `map {$_ = distilReads::argClean($argHash{$_}); chomp} @params`, or similar
$f = distilReads::argClean($argHash{"filenames"});
#       or better: use argHash directly
$st = distilReads::argClean($argHash{"seqtype"});
my ($f, $st, $outdir, $StudyName, $num_threads, $BCfilter, $UMIfilter, $pattern, $frameshift) =
$outdir = distilReads::argClean($argHash{"outdir"});
  (distilReads::argClean($argHash{"filenames"}),
$StudyName = distilReads::argClean($argHash{"StudyName"});
   distilReads::argClean($argHash{"seqtype"}),
$num_threads = distilReads::argClean($argHash{"num_threads"});
   distilReads::argClean($argHash{"outdir"}),
$BCfilter = distilReads::argClean($argHash{"BCfilter"});
   distilReads::argClean($argHash{"StudyName"}),
$UMIfilter = distilReads::argClean($argHash{"UMIfilter"});
   distilReads::argClean($argHash{"num_threads"}),
$pattern = distilReads::argClean($argHash{"find_pattern"});
   distilReads::argClean($argHash{"BCfilter"}),
$frameshift = distilReads::argClean($argHash{"correct_frameshift"});
   distilReads::argClean($argHash{"UMIfilter"}),
   distilReads::argClean($argHash{"find_pattern"}),
   distilReads::argClean($argHash{"correct_frameshift"}));


chomp($f);
chomp($f);
chomp($st);
chomp($st);
@@ -59,62 +58,71 @@ chomp($BCfilter);
chomp($UMIfilter);
chomp($UMIfilter);
chomp($pattern);
chomp($pattern);
chomp($frameshift);
chomp($frameshift);
$isPass="pass";
my $isPass="pass";


$outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
my $outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
$outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";
my $outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";


# Make and open all the file handles
# Make and open all the file handles
%file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);
my %file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);


# get all the filehandles in @keys
# get all the filehandles in @keys
@keys = sort(keys %file_handles);
my @keys = sort(keys %file_handles);


for ($i=0;$i<=$#keys;$i++) {
for (my $i=0; $i<=$#keys; $i++) {
  $fh = $keys[$i];
  my $fhName = $keys[$i];
  @fp = split(":",$file_handles{$fh});
  my $fh = $file_handles{$fhName}{"handle"};
  my @fp = split(":",$file_handles{$fhName}{"id"});


  if ($fp[0] =~ /\.gz$/) {
  if ($fp[0] =~ /\.gz$/) { # check file name
    $oriF = $fp[0];
    my $oriF = $fp[0];
    $oriBase = `basename $oriF .gz`;
    my $oriBase = `basename $oriF .gz`;
    chomp($oriBase);
    chomp($oriBase);


    #change the file name to temporary prefix for its chunk
    #change the file name to temporary prefix for its chunk
    $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    open $fh, '-|', $pigz, '-dc', $chunk || die "Couldn't open file ".$chunk.
    open my $fhTmp, '-|', $pigz, '-dc', $chunk || die "Couldn't open file ".$chunk.
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
    $file_handles{$fhName}{"handle"} = $fhTmp;
  } else {
  } else {
    $oriF = $fp[0];
    my $oriF = $fp[0];
    $oriBase = `basename $oriF'`;
    my $oriBase = `basename $oriF'`;
    #change the file name to temporary prefix for its chunk
    #change the file name to temporary prefix for its chunk
    $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    open $fh, "<", $chunk || die "Couldn't open file ".$chunk.
    open my $fhTmp, "<", $chunk || die "Couldn't open file ".$chunk.
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
    $file_handles{$fhName}{"handle"} = $fhTmp;
  }
  }
}
}


$total = 0;
my $total = 0;
$filtered = 0;
my $filtered = 0;
%bclist;
my $discarded = 0;
my $phredoffset;
my %bclist;


open(BCBAM,"| $samtoolsexc view -Sb - > $outbam");
open(BCBAM,"| $samtoolsexc view -Sb - > $outbam");


# First file handle to start the while loop for the first file
# First file handle to start the while loop for the first file
$fh1 = $keys[0];
my $fhName1 = $keys[0];
@fp1 = split(":",$file_handles{$fh1});
my $fh1 = $file_handles{$fhName1}{"handle"};
$count = 0;
my @fp1 = split(":",$file_handles{$fhName1}{"id"});
my $count = 0;
my $printedHeader = 0; # false
# reading the first file while others are processed in parallel within
# reading the first file while others are processed in parallel within
while (<$fh1>) {
while (<$fh1>) {
  $total++;
  $total++;
  $rid=$_;
  my $rid=$_;
  $rseq=<$fh1>;
  my $rseq=<$fh1>;
  $qid=<$fh1>;
  my $qid=<$fh1>;
  $qseq=<$fh1>;
  my $qseq=<$fh1>;
  $p1 = $fp1[1];
  my $p1 = $fp1[1];
  $p2 = $fp1[2];
  my $p2 = $fp1[2];
  $p3 = $fp1[3];
  my $p3 = $fp1[3];
  $ss3 = "yespattern";
  my $ss3 = "yespattern";
  #$flag = 0;
  my $mcrseq;
  my $checkpattern;
  my $goahead;
  #This block checks if the read should have certain pattern
  #This block checks if the read should have certain pattern
  if ($p2 =~ /^character/) {
  if ($p2 =~ /^character/) {
    $mcrseq = $rseq;
    $mcrseq = $rseq;
@@ -140,8 +148,7 @@ while (<$fh1>) {


  #This block checks if the read should be read corrected for frameshift in BC pattern
  #This block checks if the read should be read corrected for frameshift in BC pattern
  if ($p3 !~ /^character/) {
  if ($p3 !~ /^character/) {
    @bla = split($p3,$rseq);
    my @bla = split($p3,$rseq);
    #next if($#bla != 1);
    if ($#bla != 1) {
    if ($#bla != 1) {
      $isPass = "fail";
      $isPass = "fail";
    } else {
    } else {
@@ -149,7 +156,7 @@ while (<$fh1>) {
    }
    }
    #print $isPass,"\n";
    #print $isPass,"\n";
    if ($isPass ne "fail") {
    if ($isPass ne "fail") {
      $qst = length($rseq) - length($isPass);
      my $qst = length($rseq) - length($isPass);
      $qseq = substr($qseq, $qst);
      $qseq = substr($qseq, $qst);
      $rseq = $isPass;
      $rseq = $isPass;
    }
    }
@@ -159,7 +166,7 @@ while (<$fh1>) {
    $count=1;
    $count=1;
    $phredoffset = distilReads::checkPhred($qseq);
    $phredoffset = distilReads::checkPhred($qseq);
  }
  }
  ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
  my ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
    ("","","","","","","","",0,"SE");
    ("","","","","","","","",0,"SE");


  if ($isPass ne "fail") {
  if ($isPass ne "fail") {
@@ -167,16 +174,17 @@ while (<$fh1>) {
      distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
      distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
  }
  }


  for ($i=1;$i<=$#keys;$i++) {
  for (my $i=1; $i<=$#keys; $i++) {
    $fh = $keys[$i];
    my $fhName = $keys[$i];
    @fp = split(":",$file_handles{$fh});
    my $fh = $file_handles{$fhName}{"handle"};
    $rid1=<$fh>;
    my @fp = split(":",$file_handles{$fhName}{"id"});
    $rseq1=<$fh>;
    my $rid1=<$fh>;
    $qid1=<$fh>;
    my $rseq1=<$fh>;
    $qseq1=<$fh>;
    my $qid1=<$fh>;
    $p = $fp[1];
    my $qseq1=<$fh>;
    $pf = $fp[2];
    my $p = $fp[1];
    $pf2 = $fp[3];
    my $pf = $fp[2];
    my $pf2 = $fp[3];
    $ss3 = "yespattern";
    $ss3 = "yespattern";


    #This block checks if the read should have certian pattern
    #This block checks if the read should have certian pattern
@@ -192,7 +200,7 @@ while (<$fh1>) {
    # If it is smart-seq3 pattern in the YAML file but not found in the read then the read
    # If it is smart-seq3 pattern in the YAML file but not found in the read then the read
    #   is retained as full cDNA read where UMI is null.
    #   is retained as full cDNA read where UMI is null.
    if ($pf eq "ATTGCGCAATG") {
    if ($pf eq "ATTGCGCAATG") {
      $af = substr($mcrseq,0,length($pf));
      my $af = substr($mcrseq,0,length($pf));
      if (Approx::amatch($checkpattern, [ 1 ],$af)) {
      if (Approx::amatch($checkpattern, [ 1 ],$af)) {
        $ss3 = "yespattern";
        $ss3 = "yespattern";
        $checkpattern = $pf;
        $checkpattern = $pf;
@@ -204,7 +212,7 @@ while (<$fh1>) {


    #This block checks if the read should be read corrected for frameshift in BC pattern
    #This block checks if the read should be read corrected for frameshift in BC pattern
    if ($pf2 !~ /^character/) {
    if ($pf2 !~ /^character/) {
      @bla = split($pf2,$rseq1);
      my @bla = split($pf2,$rseq1);
      #next if($#bla != 1);
      #next if($#bla != 1);
      if ($#bla != 1) {
      if ($#bla != 1) {
        $isPass = "fail";
        $isPass = "fail";
@@ -213,14 +221,14 @@ while (<$fh1>) {
      }
      }
      #print $isPass,"\n";
      #print $isPass,"\n";
      if ($isPass ne "fail") {
      if ($isPass ne "fail") {
        $qst = length($rseq1) - length($isPass);
        my $qst = length($rseq1) - length($isPass);
        $qseq1 = substr($qseq1, $qst);
        $qseq1 = substr($qseq1, $qst);
        $rseq1 = $isPass;
        $rseq1 = $isPass;
      }
      }
    }
    }


    @c = split(/\/|\s/,$rid);
    my @c = split(/\/|\s/,$rid);
    @b = split(/\/|\s/,$rid1);
    my @b = split(/\/|\s/,$rid1);
    if ($c[0] ne $b[0]) {
    if ($c[0] ne $b[0]) {
      print $c[0],"\n",$b[0],"\n";
      print $c[0],"\n",$b[0],"\n";
      print "ERROR! Fastq files are not in the same order.\n Make sure to provide reads in the same order.\n\n";
      print "ERROR! Fastq files are not in the same order.\n Make sure to provide reads in the same order.\n\n";
@@ -228,6 +236,7 @@ while (<$fh1>) {
    }
    }


    # get the BC, UMI and cDNA sequences from all the given fastq files and concatenate according to given ranges
    # get the BC, UMI and cDNA sequences from all the given fastq files and concatenate according to given ranges
    my ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay);
    if ($isPass ne "fail") {
    if ($isPass ne "fail") {
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
        distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
        distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
@@ -244,20 +253,20 @@ while (<$fh1>) {
  #next if($flag = 1); # if correct_Frame is present and the pattern is not found in the read
  #next if($flag = 1); # if correct_Frame is present and the pattern is not found in the read


  # Check the quality filter thresholds given
  # Check the quality filter thresholds given
  @bcthres = split(" ",$BCfilter);
  my @bcthres = split(" ",$BCfilter);
  @umithres = split(" ",$UMIfilter);
  my @umithres = split(" ",$UMIfilter);


  if (($bcthres[0] < 0) || (length($bcthres) > length($bcseq))) {
  if (($bcthres[0] < 0) || (scalar(@bcthres) > length($bcseq))) {
    $bcthres[0] = 1;
    $bcthres[0] = 1;
  }
  }
  if (($umithres[0] < 0) || (length($umithres) > length($ubseq))) {
  if (($umithres[0] < 0) || (scalar(@umithres) > length($ubseq))) {
    $umithres[0] = 1;
    $umithres[0] = 1;
  }
  }
  # map to the correct phredoffset and get the number of bases under given quality threshold
  # map to the correct phredoffset and get the number of bases under given quality threshold
  @bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
  my @bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
  @mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
  my @mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
  $btmp = grep {$_ < $bcthres[1]} @bquals;
  my $btmp = grep {$_ < $bcthres[1]} @bquals;
  $mtmp = grep {$_ < $umithres[1]} @mquals;
  my $mtmp = grep {$_ < $umithres[1]} @mquals;


  ## Check if it is a smartseq3 pattern. If so, allow for approximate
  ## Check if it is a smartseq3 pattern. If so, allow for approximate
  ## match with 1 base distance. If it is not a smartseq3 pattern,
  ## match with 1 base distance. If it is not a smartseq3 pattern,
@@ -267,7 +276,7 @@ while (<$fh1>) {
  # IF the read should not have any pattern, the $checkpattern is
  # IF the read should not have any pattern, the $checkpattern is
  # equal to $mcrseq so $goahead variable will stay "yes"
  # equal to $mcrseq so $goahead variable will stay "yes"
  if ($checkpattern eq "ATTGCGCAATG") {
  if ($checkpattern eq "ATTGCGCAATG") {
    $ac = substr($mcrseq,0,length($checkpattern));
    my $ac = substr($mcrseq,0,length($checkpattern));
    if (Approx::amatch($checkpattern, [ 1 ],$ac)) {
    if (Approx::amatch($checkpattern, [ 1 ],$ac)) {
      $goahead = "yes";
      $goahead = "yes";
    } else {
    } else {
@@ -284,7 +293,7 @@ while (<$fh1>) {
  # TODO: convert string checks to logical comparisons
  # TODO: convert string checks to logical comparisons
  if (($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($goahead eq "yes") && ($isPass ne "fail")) {
  if (($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($goahead eq "yes") && ($isPass ne "fail")) {
    chomp($rid);
    chomp($rid);

    my $ridtmp;
    if ($rid =~ m/^\@.*\s/) {
    if ($rid =~ m/^\@.*\s/) {
      $rid =~ m/^\@(.*)\s/;
      $rid =~ m/^\@(.*)\s/;
      $ridtmp = $1;
      $ridtmp = $1;
@@ -295,7 +304,11 @@ while (<$fh1>) {


    $filtered++;
    $filtered++;
    $bclist{$bcseq}++;
    $bclist{$bcseq}++;
    #print $lay,"\n";
    if(!$printedHeader){
      print(BCBAM join("\t", ("@"."PG","ID:zUMIs-fqfilter","PN:zUMIs-fqfilter", "VN:2",
                              "CL:fqfilter_v2.pl ${argLine}")) . "\n");
      $printedHeader = 1; # true
    }
    if ($lay eq "SE") {
    if ($lay eq "SE") {
      print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",
      print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
        $cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
@@ -305,16 +318,26 @@ while (<$fh1>) {
      print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",
      print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
        $cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
    }
    }
  } else {
    $discarded++;
  }
  }
}
}


close BCBAM;
close BCBAM;
for ($i=0;$i<=$#keys;$i++) {
for (my $i=0; $i<=$#keys; $i++) {
  close $keys[$i];
  my $fhTmp = $file_handles{$keys[$i]}{"handle"};
  close $fhTmp;
}

if ($filtered == 0) {
  print(STDERR "No reads emitted; please check the parameter file and the warnings above.\n");
  exit;
} else {
  print(STDERR "reads kept: $filtered; reads discarded: $discarded\n");
}
}


open BCOUT, '>', $outbcstats || die "Couldn't open file ".$outbcstats.". Check permissions!\n";
open BCOUT, '>', $outbcstats || die "Couldn't open file ".$outbcstats.". Check permissions!\n";
foreach $bc (keys %bclist) {
foreach my $bc (keys %bclist) {
  print BCOUT $bc,"\t",$bclist{$bc},"\n";
  print BCOUT $bc,"\t",$bclist{$bc},"\n";
}
}
close BCOUT;
close BCOUT;