Commit a504f399 authored by Christoph's avatar Christoph
Browse files

zUMIs with combined in/ex bam and umitools alpha commit

parent 03754453
Loading
Loading
Loading
Loading

Approx.pm

0 → 100755
+1151 −0

File added.

Preview size limit exceeded, changes collapsed.

LICENSE

0 → 100755
+674 −0

File added.

Preview size limit exceeded, changes collapsed.

checkyaml.R

0 → 100755
+137 −0
Original line number Diff line number Diff line
#!/usr/bin/env Rscript

suppressMessages(require(yaml))
y<-commandArgs(trailingOnly = T)

inp<-try(read_yaml(y),silent =  T)

e=0

if(grepl("try-error",class(inp))) {
  e=1
  print(e)
  stop()
} 

       
## Check if mandatory parameters have arguments
if(is.null(inp$project)) {
  e=1 
  print("Please provide a project name. It can not be NULL.")
}
if(is.null(inp$reference$STAR_index)) {
  e=1 
  print("Please provide path to STAR index directory. It can not be NULL.")
}
if(is.null(inp$reference$GTF_file))  {
  e=1 
  print("Please provide path to GTF file. It can not be NULL.")
}

if(is.null(inp$out_dir))  {
  e=1
  print("Please provide path to output directory. It can not be NULL.")
}

if(is.null(unlist(inp$sequence_files)))  {
  e=1 
  print("You need to provide atleast two input fastq files and their base definitions.")
}

lapply(inp$sequence_files, function(x) {
  if(is.null(x$name) | is.null(x$base_definition)){
    e=1
    print("You can not leave out file name or base definition.")
  }
} )


print(
  paste(sapply(inp$sequence_files, function(x) 
    if(is.null(unlist(x$base_definition)))  {
      e=1 
      print("You need to provide base definition for all input files")
      }else { print("")}
    )))

print(
  paste(
    lapply(seq_along(inp$sequence_files), 
           function(x) {
             if(is.null(unlist(inp$sequence_files[[x]]$name)))  {
               e=1 
               print(paste(print(names(inp$sequence_files[x])),": You need to provide path to the input file")) 
               }else{ print("")}
             })
))
print(
   paste(
  lapply(seq_along(inp$sequence_files), 
         function(x) {
           if(is.null(unlist(inp$sequence_files[[x]]$base_definition))) {
             e=1 
             print(paste(print(names(inp$sequence_files[x])),": You need to provide base definitions for this input file"))}
         })
  )
)

lapply(inp$sequence_files, 
       function(x) {
         if(!is.null(x$base_definition)) {
           if(any(!grepl("^BC|^UMI|^cDNA",x$base_definition)))  {
             e=1 
             print("The base definition can only be BC/cDNA/UMI. Check if you have a typo/special characters in your base definition or you forgot to add /space/ after -. Refer to the example yaml.")
           }
         }
       })


## Check if all the file paths are correct

  lapply(inp$sequence_files, 
       function(x) {
         if(!is.null(x$name)) {
           if(!file.exists(x$name))  {
             e=1 
             print("Please check fastq file paths.")
           }
         }
       })

  if(!is.null(inp$reference$GTF_file)) {
    if(!file.exists(inp$reference$GTF_file))  {
      e=1 
      print("GTF file does not exists")
    }
  }
  if(!is.null(inp$reference$STAR_index)) {
    if(!file.exists(inp$reference$STAR_index))  {
      e=1 
      print("STAR index does not exists")
    }
  }

  if(!is.null(inp$barcodes$barcode_file)){
  if(!file.exists(inp$barcodes$barcode_file))  {
    e=1 
    print("Please check barcode list file path.")
  }
  }
 
## Some other variable's validity check 
if(!is.numeric(inp$num_threads))  {
  e=1 
  print("Number of threads should be a number.")
}
if(!inp$num_threads >= 1)  {
  e=1 
  print("Number of threads should be a number >= 1")
}

if(!grepl("Filtering|Mapping|Counting|Summarising",inp$which_Stage,ignore.case = T))  {
  e=1 
  print("which_stage argument can only be one of these terms: Filtering, Mapping, Counting, Summarising")
}

  
print(e)
 No newline at end of file

correct_BCtag.pl

0 → 100644
+54 −0
Original line number Diff line number Diff line
#!/usr/bin/perl
use warnings;

if(@ARGV != 4)
{
print
"\n#####################################################################################
Usage: perl $0 <inbam> <outbam> <BCbinmap> <samtools-executable> \n
Please drop your suggestions and clarifications to <christoph.ziegenhain\@ki.se>\n
######################################################################################\n\n";
exit;
}
BEGIN{
$inbam=$ARGV[0];
$outbam=$ARGV[1];
$binmap=$ARGV[2];
$samtoolsexc=$ARGV[3];
}

open(INBAM, "$samtoolsexc view $inbam | sed 's/BC:Z://' | " ) || die "Couldn't open file $inbam. Check permissions!\n Check if it is a bam file and it exists\n\n";
open(BCBAM,"| $samtoolsexc view -b -o $outbam -");


my %bcmap;
open(DATA, "cat $binmap | sed 's/,/\t/g' | cut -f1,3 | grep -v 'falseBC' | ") || die "Can't open $binmap ! \n";
while (<DATA>) {
  my ($raw, @fixedBC) = split(/\t/);
  $bcmap{$raw} = \@fixedBC;
}
close DATA;


while (<INBAM>) {
  $read=$_;
  chomp($read);
  @read = split(/\t/,$read);
  $thisBC = $read[11];

  if (defined($bcmap{$thisBC})) {
    #print "BC is in hash\n";
    $correctBC = $bcmap{$thisBC}[0];
    chomp($correctBC);
  }
  else {
    #print "BC is not in hash\n";
    $correctBC = $thisBC;
  }

  print BCBAM $read[0],"\t",$read[1],"\t",$read[2],"\t",$read[3],"\t",$read[4],"\t",$read[5],"\t",$read[6],"\t",$read[7],"\t",$read[8],"\t",
        $read[9],"\t",$read[10],"\t","BX:Z:",$thisBC,"\t","BC:Z:",$correctBC,"\t",$read[12],"\n";

}
close INBAM;
close BCBAM;

correct_UBtag.pl

0 → 100755
+82 −0
Original line number Diff line number Diff line
#!/usr/bin/perl
use warnings;

if(@ARGV != 4)
{
print
"\n#####################################################################################
Usage: perl $0 <inbam> <outbam> <moleculemap> <samtools-executable> \n
Please drop your suggestions and clarifications to <christoph.ziegenhain\@ki.se>\n
######################################################################################\n\n";
exit;
}
BEGIN{
$inbam=$ARGV[0];
$outbam=$ARGV[1];
$binmap=$ARGV[2];
$samtoolsexc=$ARGV[3];
}

open(BCBAM,"| $samtoolsexc view -b -o $outbam -");
open(BAM, "$samtoolsexc view -H $inbam |  " ) || die "Couldn't open file $inbam. Check permissions!\n Check if it is a bam file and it exists\n\n";
while (<BAM>) {
  print BCBAM $_;
}
close BAM;


my %ubmap; #= {};
open(DATA, "cat $binmap | sed 's/,/\t/g' | cut -f1,2,4 | grep -v 'false' | ") || die "Can't open $binmap ! \n";
while (<DATA>) {
  $line=$_;
  chomp($line);
  @splitout = split(/\t/, $line); #rawUB correctUB Gene
  $ubmap{$splitout[2]}{$splitout[0]} = $splitout[1];
}
close DATA;

open(INBAM, "$samtoolsexc view $inbam | sed 's/UB:Z:/UX:Z:/g'  |  " ) || die "Couldn't open file $inbam. Check permissions!\n Check if it is a bam file and it exists\n\n";

while (<INBAM>) {
  $read=$_;
  chomp($read);
  @readarr = split(/\t/,$read);
  my @UXmatches = grep { /UX:Z:/ } @readarr;
  $UXtag = $UXmatches[0];
  $UXtag =~ s/UX:Z://g;

  my @GEmatches = grep { /XT:Z:/ } @readarr;

  if (!@GEmatches == 0){
    #print "GE found";
    $GEtag = $GEmatches[0];
    $GEtag =~ s/XT:Z://g;

    if (defined($ubmap{$GEtag})) {
      #print "GE found";
      if(defined($ubmap{$GEtag}{$UXtag})){
        #print "UX within this Gene found\n";
        $correctUB = $ubmap{$GEtag}{$UXtag};#[0];
        #print $correctUB
      }
      else {
        #print "Taking old UX code 1";
        $correctUB = $UXtag;
      }
    }
    else {
      #print "Taking old UX code 2";
      $correctUB = $UXtag;
    }
  }
  else {
    #print "Taking old UX code 3";
    $correctUB = $UXtag;
  }


  print BCBAM $read,"\t","UB:Z:",$correctUB,"\n";

}
close INBAM;
close BCBAM;
Loading