Commit 2cacbed9 authored by cziegenhain's avatar cziegenhain
Browse files

revert fqfilter to remove PE bug

parent d8edde4b
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
10 July 2020: [zUMIs2.9.1](https://github.com/sdparekh/zUMIs/releases/tag/2.9.1): Revert changes from pull request #194 (commit  #ff7e879) that introduced a bug when using PE reads.

05 July 2020: [zUMIs2.9.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.9.0): Speed up STAR read mapping by parallel instances if enough resources are available. Change the main zUMIs script to `zUMIs.sh`. Speed up and reduce clutter by loading reads from bam files using parallelised Rsamtools calls instead of printing temporary text files. Speed up counting by parallelising exon / intron / exon+intron counting as well as downsamplings. Speed up by parallelising creation of wide format count matrices. 

23 June 2020: zUMIs2.8.3: Merged code contribution from @gringer: prevent errors by emitting SAM headers in chunked unmapped .bam file output of fqfilter. Changed call to STAR to prevent stalling of samtools pipe. 

distilReads.pm

100755 → 100644
+6 −11
Original line number Diff line number Diff line
@@ -27,10 +27,9 @@ sub makeFileHandles{

    $fh="file".$j;

    if ( open $fhTmp, '<', $file ) {
      $file_handles{ $fh }{"id"} = $file.":".$pat.":".$fpat.":".$cframe;
      $file_handles{ $fh }{"handle"} = $fhTmp;
      close $fhTmp;
    if ( open $fh, '<', $file ) {
      $file_handles{ $fh } = $file.":".$pat.":".$fpat.":".$cframe;
      close $fh;
    }
    else {
      warn "couldn't open $file for reading: $!\n";
@@ -40,12 +39,8 @@ sub makeFileHandles{
}

sub checkPhred{
  my ($bqseq) = @_;
    @quals = map {$_} unpack "C*", $bqseq;
  my $offset = 33;
  if (grep {$_ > 74} @quals) {
    $offset = 64;
  }
    if(grep {$_ > 74} @quals){$offset=64;}else{$offset=33;}
    return $offset;
}

+205 −237
Original line number Diff line number Diff line
#!/usr/bin/env perl
use warnings;
use strict;
#!/usr/bin/perl
# LMU Munich. AG Enard
# A script to filter reads based on Barcode base quality.

if (@ARGV != 6) {
if(@ARGV != 6)
{
print
    "\n#####################################################################################".
    "Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n".
    "Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n".
    "######################################################################################\n\n";
"\n#####################################################################################
Usage: perl $0 <yaml> <samtools-executable> <rscript-executable> <pigz-executable> <zUMIs-dir> <tmpPrefix>\n
Please drop your suggestions and clarifications to <sparekh\@age.mpg.de>\n
######################################################################################\n\n";
exit;
}

my $argLine = join(" ", @ARGV);

my ($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix);
BEGIN{
  ($yml, $samtoolsexc, $rscriptexc, $pigz, $zumisdir, $tmpPrefix) = @ARGV;
$yml=$ARGV[0];
$samtoolsexc=$ARGV[1];
$rscriptexc=$ARGV[2];
$pigz=$ARGV[3];
$zumisdir=$ARGV[4];
$tmpPrefix=$ARGV[5];
}

use lib "$zumisdir";
use distilReads;
use Approx;

open(YL,"$rscriptexc $zumisdir/readYaml4fqfilter.R $yml |");
my @arg=<YL>;
@arg=<YL>;
close YL;
my %argHash;
my @params=("filenames", "seqtype", "outdir", "StudyName", "num_threads",
            "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");
%argHash;
@params=("filenames", "seqtype", "outdir", "StudyName", "num_threads", "BCfilter", "UMIfilter", "find_pattern", "correct_frameshift");


for (my $i=0; $i<=$#params; $i++) {
for($i=0;$i<=$#params;$i++){
  $argHash{$params[$i]}=$arg[$i];
}

# parse the fastqfiles and make a hash with file handles as key and filename.pattern as value
# TODO: convert to `map {$_ = distilReads::argClean($argHash{$_}); chomp} @params`, or similar
#       or better: use argHash directly
my ($f, $st, $outdir, $StudyName, $num_threads, $BCfilter, $UMIfilter, $pattern, $frameshift) =
  (distilReads::argClean($argHash{"filenames"}),
   distilReads::argClean($argHash{"seqtype"}),
   distilReads::argClean($argHash{"outdir"}),
   distilReads::argClean($argHash{"StudyName"}),
   distilReads::argClean($argHash{"num_threads"}),
   distilReads::argClean($argHash{"BCfilter"}),
   distilReads::argClean($argHash{"UMIfilter"}),
   distilReads::argClean($argHash{"find_pattern"}),
   distilReads::argClean($argHash{"correct_frameshift"}));
$f = distilReads::argClean($argHash{"filenames"});
$st = distilReads::argClean($argHash{"seqtype"});
$outdir = distilReads::argClean($argHash{"outdir"});
$StudyName = distilReads::argClean($argHash{"StudyName"});
$num_threads = distilReads::argClean($argHash{"num_threads"});
$BCfilter = distilReads::argClean($argHash{"BCfilter"});
$UMIfilter = distilReads::argClean($argHash{"UMIfilter"});
$pattern = distilReads::argClean($argHash{"find_pattern"});
$frameshift = distilReads::argClean($argHash{"correct_frameshift"});


#/data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r1.fq.gz; /data/share/htp/Project_mcSCRB-seqPaper/PEG_May2017/demult_HEK_r2.fq.gz;ACTGCTGTA
#if find_pattern exists, readYaml4fqfilter returns  "ATTGCGCAATG character(0) character(0)"

chomp($f);
chomp($st);
@@ -58,83 +58,74 @@ chomp($BCfilter);
chomp($UMIfilter);
chomp($pattern);
chomp($frameshift);
my $isPass="pass";
$isPass="pass";

my $outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
my $outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";
$outbcstats = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.BCstats.txt";
$outbam = "$outdir/zUMIs_output/.tmpMerge/$StudyName.$tmpPrefix.filtered.tagged.bam";

# Make and open all the file handles
my %file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);
%file_handles = distilReads::makeFileHandles($f,$st,$pattern,$frameshift);

# get all the filehandles in @keys
my @keys = sort(keys %file_handles);
@keys = sort(keys %file_handles);

for (my $i=0; $i<=$#keys; $i++) {
  my $fhName = $keys[$i];
  my $fh = $file_handles{$fhName}{"handle"};
  my @fp = split(":",$file_handles{$fhName}{"id"});
for($i=0;$i<=$#keys;$i++){
  $fh = $keys[$i];
  @fp = split(":",$file_handles{$fh});

  if ($fp[0] =~ /\.gz$/) { # check file name
    my $oriF = $fp[0];
    my $oriBase = `basename $oriF .gz`;
  if ($fp[0] =~ /\.gz$/) {
		$oriF = $fp[0];
		$oriBase = `basename $oriF .gz`;
    chomp($oriBase);

		#change the file name to temporary prefix for its chunk
    my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    open my $fhTmp, '-|', $pigz, '-dc', $chunk || die "Couldn't open file ".$chunk.
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
    $file_handles{$fhName}{"handle"} = $fhTmp;
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    open $fh, '-|', $pigz, '-dc', $chunk || die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
  }else {
    my $oriF = $fp[0];
    my $oriBase = `basename $oriF'`;

		$oriF = $fp[0];
		$oriBase = `basename $oriF'`;
		#change the file name to temporary prefix for its chunk
    my $chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";
    open my $fhTmp, "<", $chunk || die "Couldn't open file ".$chunk.
      ". Check permissions!\n Check if it is differently zipped then .gz\n\n";
    $file_handles{$fhName}{"handle"} = $fhTmp;
		$chunk = "$outdir/zUMIs_output/.tmpMerge/$oriBase$tmpPrefix.gz";

    open $fh, "<", $chunk || die "Couldn't open file ".$chunk.". Check permissions!\n Check if it is differently zipped then .gz\n\n";
  }
}

my $total = 0;
my $filtered = 0;
my $discarded = 0;
my $phredoffset;
my %bclist;
$total = 0;
$filtered = 0;
%bclist;

open(BCBAM,"| $samtoolsexc view -Sb - > $outbam");

# First file handle to start the while loop for the first file
my $fhName1 = $keys[0];
my $fh1 = $file_handles{$fhName1}{"handle"};
my @fp1 = split(":",$file_handles{$fhName1}{"id"});
my $count = 0;
my $printedHeader = 0; # false
$fh1 = $keys[0];
@fp1 = split(":",$file_handles{$fh1});
$count = 0;
# reading the first file while others are processed in parallel within
while(<$fh1>){
  $total++;
  my $rid=$_;
  my $rseq=<$fh1>;
  my $qid=<$fh1>;
  my $qseq=<$fh1>;
  my $p1 = $fp1[1];
  my $p2 = $fp1[2];
  my $p3 = $fp1[3];
  my $ss3 = "yespattern";
  my $mcrseq;
  my $checkpattern;
  my $goahead;
  #This block checks if the read should have certain pattern
  $rid=$_;
	$rseq=<$fh1>;
	$qid=<$fh1>;
	$qseq=<$fh1>;
	$p1 = $fp1[1];
  $p2 = $fp1[2];
  $p3 = $fp1[3];
  $ss3 = "yespattern";
#$flag = 0;
#This block checks if the read should have certian pattern
  if($p2 =~ /^character/){
    $mcrseq = $rseq;
    $checkpattern = $rseq;
  } else {
  }
  else{
    $mcrseq = $rseq;
    $checkpattern = $p2;
  }

  # This block checks if smart-seq3 pattern is present and if it is found in the reads
  # If it is smart-seq3 pattern in the YAML file but not found in the read then the read
  #   is retained as full cDNA read where UMI is null.
  # If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
  if($p2 eq "ATTGCGCAATG"){
    $a = substr($mcrseq,0,length($p2));
    if(Approx::amatch($checkpattern, [ 1 ],$a)){
@@ -146,9 +137,12 @@ while (<$fh1>) {
    }
  }



#This block checks if the read should be read corrected for frameshift in BC pattern
  if($p3 !~ /^character/){
    my @bla = split($p3,$rseq);
    @bla = split($p3,$rseq);
    #next if($#bla != 1);
    if($#bla != 1){
      $isPass = "fail";
    }else{
@@ -156,7 +150,7 @@ while (<$fh1>) {
    }
    #print $isPass,"\n";
    if($isPass ne "fail"){
      my $qst = length($rseq) - length($isPass);
      $qst = length($rseq) - length($isPass);
      $qseq = substr($qseq, $qst);
      $rseq = $isPass;
    }
@@ -166,41 +160,39 @@ while (<$fh1>) {
    $count=1;
    $phredoffset = distilReads::checkPhred($qseq);
  }
  my ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
    ("","","","","","","","",0,"SE");
  ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = ("","","","","","","","",0,"SE");

  if($isPass ne "fail"){
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) =
      distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
  }

  for (my $i=1; $i<=$#keys; $i++) {
    my $fhName = $keys[$i];
    my $fh = $file_handles{$fhName}{"handle"};
    my @fp = split(":",$file_handles{$fhName}{"id"});
    my $rid1=<$fh>;
    my $rseq1=<$fh>;
    my $qid1=<$fh>;
    my $qseq1=<$fh>;
    my $p = $fp[1];
    my $pf = $fp[2];
    my $pf2 = $fp[3];
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2, $cdc, $lay) = distilReads::makeSeqs($rseq,$qseq,$p1,$cdc,$ss3);
  }

	for($i=1;$i<=$#keys;$i++){
    $fh = $keys[$i];
    @fp = split(":",$file_handles{$fh});
  #  $flag = 0;
		$rid1=<$fh>;
		$rseq1=<$fh>;
		$qid1=<$fh>;
		$qseq1=<$fh>;
    $p = $fp[1];
    $pf = $fp[2];
    $pf2 = $fp[3];
    $ss3 = "yespattern";

    #This block checks if the read should have certian pattern
      if($pf =~ /^character/){
        $mcrseq = $rseq1;
        $checkpattern = $rseq1;
    } else {
      }
      else{
        $mcrseq = $rseq1;
        $checkpattern = $pf;
      }

      # This block checks if smart-seq3 pattern is present and if it is found in the reads
    # If it is smart-seq3 pattern in the YAML file but not found in the read then the read
    #   is retained as full cDNA read where UMI is null.
      # If it is smart-seq3 pattern in the YAML file but not found in the read then the read is retained as full cDNA read where UMI is null.
      if($pf eq "ATTGCGCAATG"){
      my $af = substr($mcrseq,0,length($pf));
        $af = substr($mcrseq,0,length($pf));
        if(Approx::amatch($checkpattern, [ 1 ],$af)){
          $ss3 = "yespattern";
          $checkpattern = $pf;
@@ -212,7 +204,7 @@ while (<$fh1>) {

    #This block checks if the read should be read corrected for frameshift in BC pattern
      if($pf2 !~ /^character/){
      my @bla = split($pf2,$rseq1);
        @bla = split($pf2,$rseq1);
        #next if($#bla != 1);
        if($#bla != 1){
          $isPass = "fail";
@@ -221,14 +213,14 @@ while (<$fh1>) {
        }
        #print $isPass,"\n";
        if($isPass ne "fail"){
        my $qst = length($rseq1) - length($isPass);
          $qst = length($rseq1) - length($isPass);
          $qseq1 = substr($qseq1, $qst);
          $rseq1 = $isPass;
        }
      }

    my @c = split(/\/|\s/,$rid);
    my @b = split(/\/|\s/,$rid1);
    @c = split(/\/|\s/,$rid);
    @b = split(/\/|\s/,$rid1);
    if($c[0] ne $b[0]){
      print $c[0],"\n",$b[0],"\n";
      print "ERROR! Fastq files are not in the same order.\n Make sure to provide reads in the same order.\n\n";
@@ -236,47 +228,39 @@ while (<$fh1>) {
    }

    # get the BC, UMI and cDNA sequences from all the given fastq files and concatenate according to given ranges
    my ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay);
    if($isPass ne "fail"){
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
        distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
    } else {
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) =
        ("","","","","","","","",0,"SE");
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = distilReads::makeSeqs($rseq1,$qseq1,$p,$cdc,$ss3);
    }
    else
    {
      ($bcseq1, $bcqseq1, $ubseq1, $ubqseq1, $cseq1, $cqseq1, $cseq2, $cqseq2, $cdc, $lay) = ("","","","","","","","",0,"SE");
    }

    # concatenate according to given ranges with other files
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) =
      ($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1,
       $cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
    ($bcseq, $bcqseq, $ubseq, $ubqseq, $cseqr1, $cqseqr1, $cseqr2, $cqseqr2) = ($bcseq.$bcseq1, $bcqseq.$bcqseq1, $ubseq.$ubseq1, $ubqseq.$ubqseq1, $cseqr1.$cseq1, $cqseqr1.$cqseq1, $cseqr2.$cseq2, $cqseqr2.$cqseq2);
	}
#next if($flag = 1); # if correct_Frame is present and the pattern is not found in the read

    # Check the quality filter thresholds given
  my @bcthres = split(" ",$BCfilter);
  my @umithres = split(" ",$UMIfilter);
    @bcthres = split(" ",$BCfilter);
    @umithres = split(" ",$UMIfilter);

  if (($bcthres[0] < 0) || (scalar(@bcthres) > length($bcseq))) {
    if(($bcthres[0] < 0) || (length($bcthres) > length($bcseq))) {
      $bcthres[0] = 1;
    }
  if (($umithres[0] < 0) || (scalar(@umithres) > length($ubseq))) {
    if(($umithres[0] < 0) || (length($umithres) > length($ubseq))) {
      $umithres[0] = 1;
    }
    # map to the correct phredoffset and get the number of bases under given quality threshold
  my @bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
  my @mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
  my $btmp = grep {$_ < $bcthres[1]} @bquals;
  my $mtmp = grep {$_ < $umithres[1]} @mquals;

  ## Check if it is a smartseq3 pattern. If so, allow for approximate
  ## match with 1 base distance. If it is not a smartseq3 pattern,
  ## check if the read starts with the pattern at all. The $goahead
  ## variable decides if the read passes the quality threshold of a
  ## pattern.
  # IF the read should not have any pattern, the $checkpattern is
  # equal to $mcrseq so $goahead variable will stay "yes"
    @bquals = map {$_ - $phredoffset} unpack "C*", $bcqseq;
    @mquals = map {$_ - $phredoffset} unpack "C*", $ubqseq;
    $btmp = grep {$_ < $bcthres[1]} @bquals;
    $mtmp = grep {$_ < $umithres[1]} @mquals;

## Check if it is a smartseq3 pattern. If so, allow for approximate match with 1 base distance. If it is not a smartseq3 pattern, check if the read starts with the pattern at all. The $goahead variable decides if the read passes the quality threshold of a pattern.
# IF the read should not have any pattern, the $checkpattern is equal to $mcrseq so $goahead variable will stay "yes"
    if($checkpattern eq "ATTGCGCAATG"){
    my $ac = substr($mcrseq,0,length($checkpattern));
      $ac = substr($mcrseq,0,length($checkpattern));
      if(Approx::amatch($checkpattern, [ 1 ],$ac)){
        $goahead = "yes";
      }else{
@@ -290,54 +274,38 @@ while (<$fh1>) {
      }
    }
    # print out only if above the quality threshold
  # TODO: convert string checks to logical comparisons

     #if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ( Approx::amatch($checkpattern, [ 1 ],$mcrseq) ) && ($isPass ne "fail")){
     #if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($mcrseq =~ m/^$checkpattern/) && ($isPass ne "fail")){
     #if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0])){
     if(($btmp < $bcthres[0]) && ($mtmp < $umithres[0]) && ($goahead eq "yes") && ($isPass ne "fail")){
      chomp($rid);
    my $ridtmp;

      if($rid =~ m/^\@.*\s/){
        $rid =~ m/^\@(.*)\s/;
        $ridtmp = $1;
    } else {
      }
      else{
        $rid =~ m/^\@(.*)/;
        $ridtmp = $1;
      }

      $filtered++;
      $bclist{$bcseq}++;
    if(!$printedHeader){
      print(BCBAM join("\t", ("@"."PG","ID:zUMIs-fqfilter","PN:zUMIs-fqfilter", "VN:2",
                              "CL:fqfilter_v2.pl ${argLine}")) . "\n");
      $printedHeader = 1; # true
    }
      #print $lay,"\n";
      if($lay eq "SE"){
      print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
        print BCBAM $ridtmp,"\t4\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
      }else{
      print BCBAM $ridtmp,"\t77\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
      print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",
        $cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
        print BCBAM $ridtmp,"\t77\t*\t0\t0\t*\t*\t0\t0\t",$cseqr1,"\t",$cqseqr1,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
        print BCBAM $ridtmp,"\t141\t*\t0\t0\t*\t*\t0\t0\t",$cseqr2,"\t",$cqseqr2,"\tBC:Z:",$bcseq,"\tUB:Z:",$ubseq,"\tQB:Z:",$bcqseq,"\tQU:Z:",$ubqseq,"\n";
      }
  } else {
    $discarded++;
    }
}

close BCBAM;
for (my $i=0; $i<=$#keys; $i++) {
  my $fhTmp = $file_handles{$keys[$i]}{"handle"};
  close $fhTmp;
}

#if ($filtered == 0) {
#  print(STDERR "No reads emitted; please check the parameter file and the warnings above.\n");
#  exit;
#} else {
#  print(STDERR "reads kept: $filtered; reads discarded: $discarded\n");
#}
for($i=0;$i<=$#keys;$i++){  close $keys[$i]; }

open BCOUT, '>', $outbcstats || die "Couldn't open file ".$outbcstats.". Check permissions!\n";
foreach my $bc (keys %bclist) {
foreach $bc (keys %bclist){
  print BCOUT $bc,"\t",$bclist{$bc},"\n";
}
close BCOUT;
+5 −0
Original line number Diff line number Diff line
@@ -129,12 +129,14 @@ if(opt$counting_opts$Ham_Dist == 0){

  tmpbamfile <- outbamfile
  outbamfile <- paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.sorted.bam")
  print(Sys.time())
  print("Coordinate sorting intermediate bam file...")
  sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",outbamfile," ",tmpbamfile)
  system(sort_cmd)
  index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,outbamfile)
  system(index_cmd)
  system(paste0("rm ",tmpbamfile))
  print(Sys.time())
  
  for(i in unique(bccount$chunkID)){
    print( paste( "Hamming distance collapse in barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
@@ -152,12 +154,15 @@ if(opt$counting_opts$Ham_Dist == 0){
  outbamfile <- correct_UB_tags(bccount, samtoolsexc)
  sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
}
print(Sys.time())
print("Coordinate sorting final bam file...")
sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",sortbamfile," ",outbamfile)
system(sort_cmd)
index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,sortbamfile)
system(index_cmd)
system(paste0("rm ",outbamfile))
print(Sys.time())


##########################################
#set Downsampling ranges
+2 −2
Original line number Diff line number Diff line
@@ -110,7 +110,7 @@ if(inp$counting_opts$twoPass==TRUE){
#finally, run STAR
if(num_star_instances>1 & inp$which_Stage == "Filtering"){
  map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
  dir.create(path = map_tmp_dir)
  dir.create(path = map_tmp_dir,showWarnings = FALSE)
  input_split <- split(filtered_bams, ceiling(seq_along(filtered_bams) / ceiling(length(filtered_bams) / num_star_instances)))
  input_split <- sapply(input_split, paste0, collapse = ",")
  STAR_preset <- STAR_command
@@ -130,7 +130,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
  out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
  merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
  system(paste(merge_logs,"&",merge_bams,"&",merge_txbams,"& wait"))
  system(paste0("rm ", map_tmp_dir, "tmp.", inp$project, ".*"))
  system(paste0("rm -r ", map_tmp_dir, "tmp.", inp$project, ".*"))
}else{
  STAR_command <- paste(STAR_command,
    "--readFilesIn",paste0(filtered_bams,collapse=","),
Loading