Commit 74f81d46 authored by David Eccles (gringer)'s avatar David Eccles (gringer)
Browse files

Janitorial: make indentation / bracing / comment layout consistent; remove dead comments

parent 3985416c
Loading
Loading
Loading
Loading
+175 −166
Original line number Diff line number Diff line
#!/usr/bin/perl
#!/usr/bin/env perl
use warnings;
use strict;
# LMU Munich. AG Enard
# A script to filter reads based on Barcode base quality.

if(@ARGV != 6)
{
if (@ARGV != 6) {
  print
"\n#####################################################################################
Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n
Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n
######################################################################################\n\n";
    "\n#####################################################################################".
    "Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n".
    "Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n".
    "######################################################################################\n\n";
  exit;
}

BEGIN {
  ## TODO: use `(a,b,c) = @ARGV`
  $yml=$ARGV[0];
  $samtoolsexc=$ARGV[1];
  $rscriptexc=$ARGV[2];
@@ -19,6 +22,7 @@ $pigz=$ARGV[3];
  $zumisdir=$ARGV[4];
  $tmpPrefix=$ARGV[5];
}

use lib "$zumisdir";
use distilReads;
use Approx;
@@ -27,14 +31,15 @@ open(YL,"$rscriptexc $zumisdir/readYaml4fqfilter.R $yml |");
@arg=<YL>;
close YL;
%argHash;
@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads", "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");

@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
         "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");

for ($i=0;$i<=$#params;$i++) {
  $argHash{$params[$i]}=$arg[$i];
}

# parse the fastqfiles and make a hash with file handles as key and filename.pattern as value
# TODO: convert to `map {$_ = distilReads::argClean($argHash{$_}); chomp} @params`, or similar
$f = distilReads::argClean($argHash{"filenames"});
$st = distilReads::argClean($argHash{"seqtype"});
$outdir = distilReads::argClean($argHash{"outdir"});
@@ -45,10 +50,6 @@ $UMIfilter = distilReads::argClean($argHash{"UMIfilter"});
$pattern = distilReads::argClean($argHash{"find_pattern"});
$frameshift = distilReads::argClean($argHash{"correct_frameshift"});


#/data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r1.fq.gz; /data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r2.fq.gz;ACTGCTGTA
#if find_pattern exists, readYaml4fqfilter returns  "ATTGCGCAATG character(0) character(0)"

chomp($f);
chomp($st);
chomp($outdir);
@@ -80,15 +81,15 @@ for($i=0;$i<=$#keys;$i++){

    #change the file name to temporary prefix for its chunk
    $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    open $fh, '-|', $pigz, '-dc', $chunk || die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
    open $fh, '-|', $pigz, '-dc', $chunk || die "Couldn't open file ".$chunk.
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
  } else {

    $oriF = $fp[0];
    $oriBase = `basename $oriF'`;
    #change the file name to temporary prefix for its chunk
    $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";

    open $fh, "<", $chunk || die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
    open $fh, "<", $chunk || die "Couldn't open file ".$chunk.
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
  }
}

@@ -114,18 +115,18 @@ while(<$fh1>){
  $p3 = $fp1[3];
  $ss3 = "yespattern";
  #$flag = 0;
#This block checks if the read should have certian pattern
  #This block checks if the read should have certain pattern
  if ($p2 =~ /^character/) {
    $mcrseq = $rseq;
    $checkpattern = $rseq;
  }
  else{
  } else {
    $mcrseq = $rseq;
    $checkpattern = $p2;
  }

  # This block checks if smart-seq3 pattern is present and if it is found in the reads
  # If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
  # If it is smart-seq3 pattern in the YAML file but not found in the read then the read
  #   is retained as full cDNA read where UMI is null.
  if ($p2 eq "ATTGCGCAATG") {
    $a = substr($mcrseq,0,length($p2));
    if (Approx::amatch($checkpattern, [ 1 ],$a)) {
@@ -137,8 +138,6 @@ while(<$fh1>){
    }
  }



  #This block checks if the read should be read corrected for frameshift in BC pattern
  if ($p3 !~ /^character/) {
    @bla = split($p3,$rseq);
@@ -160,16 +159,17 @@ while(<$fh1>){
    $count=1;
    $phredoffset = distilReads::checkPhred($qseq);
  }
  ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = ("","","","","","","","",0,"SE");
  ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
    ("","","","","","","","",0,"SE");

  if ($isPass ne "fail") {
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
      distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
  }

  for ($i=1;$i<=$#keys;$i++) {
    $fh = $keys[$i];
    @fp = split(":",$file_handles{$fh});
  #  $flag = 0;
    $rid1=<$fh>;
    $rseq1=<$fh>;
    $qid1=<$fh>;
@@ -183,14 +183,14 @@ while(<$fh1>){
    if ($pf =~ /^character/) {
      $mcrseq = $rseq1;
      $checkpattern = $rseq1;
      }
      else{
    } else {
      $mcrseq = $rseq1;
      $checkpattern = $pf;
    }

    # This block checks if smart-seq3 pattern is present and if it is found in the reads
      # If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
    # If it is smart-seq3 pattern in the YAML file but not found in the read then the read
    #   is retained as full cDNA read where UMI is null.
    if ($pf eq "ATTGCGCAATG") {
      $af = substr($mcrseq,0,length($pf));
      if (Approx::amatch($checkpattern, [ 1 ],$af)) {
@@ -229,15 +229,17 @@ while(<$fh1>){

    # get the BC, UMI and cDNA sequences from all the given fastq files and concatenate according to given ranges
    if ($isPass ne "fail") {
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
    }
    else
    {
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = ("","","","","","","","",0,"SE");
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
        distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
    } else {
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
        ("","","","","","","","",0,"SE");
    }

    # concatenate according to given ranges with other files
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) = ($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1, $cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) =
      ($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1,
       $cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
  }
  #next if($flag = 1); # if correct_Frame is present and the pattern is not found in the read

@@ -257,8 +259,13 @@ while(<$fh1>){
  $btmp = grep {$_ < $bcthres[1]} @bquals;
  $mtmp = grep {$_ < $umithres[1]} @mquals;

## Check if it is a smartseq3 pattern. If so, allow for approximate match with 1 base distance. If it is not a smartseq3 pattern, check if the read starts with the pattern at all. The $goahead variable decides if the read passes the quality threshold of a pattern.
# IF the read should not have any pattern, the $checkpattern is equal to $mcrseq so $goahead variable will stay "yes"
  ## Check if it is a smartseq3 pattern. If so, allow for approximate
  ## match with 1 base distance. If it is not a smartseq3 pattern,
  ## check if the read starts with the pattern at all. The $goahead
  ## variable decides if the read passes the quality threshold of a
  ## pattern.
  # IF the read should not have any pattern, the $checkpattern is
  # equal to $mcrseq so $goahead variable will stay "yes"
  if ($checkpattern eq "ATTGCGCAATG") {
    $ac = substr($mcrseq,0,length($checkpattern));
    if (Approx::amatch($checkpattern, [ 1 ],$ac)) {
@@ -274,18 +281,14 @@ while(<$fh1>){
    }
  }
  # print out only if above the quality threshold

     #if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ( Approx::amatch($checkpattern, [ 1 ],$mcrseq) ) && ($isPass ne "fail")){
     #if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($mcrseq =~ m/^$checkpattern/) && ($isPass ne "fail")){
     #if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0])){
  # TODO: convert string checks to logical comparisons
  if (($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($goahead eq "yes") && ($isPass ne "fail")) {
    chomp($rid);

    if ($rid =~ m/^\@.*\s/) {
      $rid =~ m/^\@(.*)\s/;
      $ridtmp = $1;
      }
      else{
    } else {
      $rid =~ m/^\@(.*)/;
      $ridtmp = $1;
    }
@@ -294,15 +297,21 @@ while(<$fh1>){
    $bclist{$bcseq}++;
    #print $lay,"\n";
    if ($lay eq "SE") {
        print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
      print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
    } else {
        print BCBAM $ridtmp,"\t77\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
        print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
      print BCBAM $ridtmp,"\t77\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
      print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
    }
  }
}

close BCBAM;
for($i=0;$i<=$#keys;$i++){  close $keys[$i]; }
for ($i=0;$i<=$#keys;$i++) {
  close $keys[$i];
}

open BCOUT, '>', $outbcstats || die "Couldn't open file ".$outbcstats.". Check permissions!\n";
foreach $bc (keys %bclist) {