Commit 9067aef6 authored by ziegenhain@bio.lmu.de's avatar ziegenhain@bio.lmu.de
Browse files

zUMIs0.0.4 with plate BC support

parent fc0a1a37
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -10,12 +10,14 @@ You can read more about zUMIs in our [biorxiv preprint](https://www.biorxiv.org/

You can glance through zUMIs in [zUMIs poster](https://github.com/sdparekh/zUMIs/blob/master/zUMIs_GI2017_poster.pdf)!

## Releases
## Releases/Changelog
23 Feb 2018: [zUMIs.0.0.3 released](https://github.com/sdparekh/zUMIs/releases/tag/zUMIs.0.0.4).
Added support for plate barcodes with input of an additional barcode fastq file (eg. Illumina i7 index read). Addition of version number in zUMIs-master. Parameters are printed in a .zUMIs_run.txt file for each call.

18 Feb 2018: [zUMIs.0.0.3 released](https://github.com/sdparekh/zUMIs/releases/tag/zUMIs.0.0.3).
Switched support to the new Rsubread version and data format. Furthermore to compensate sequencing/PCR errors, zUMIs now features UMI correction using Hamming distance and binning of adjacent cell barcodes.

You can find the older version of zUMIs [here](https://github.com/sdparekh/zUMIs/releases/).
You can find the older versions of zUMIs [here](https://github.com/sdparekh/zUMIs/releases/).

## Installation and Usage

+53 −14
Original line number Diff line number Diff line
@@ -4,11 +4,11 @@
# Author: Swati Parekh
# Contact: parekh@bio.lmu.de or ziegenhain@bio.lmu.de or hellmann@bio.lmu.de

if(@ARGV != 12)
if(@ARGV != 14)
{
print
"\n#####################################################################################
Usage: perl $0 <barcode-Read.fq.gz> <cDNA-Read.fq.gz> <cellbc_threshold> <Cellbc_Qual_threshold> <umi_threshold> <UMIbc_Qual_threshold> <Cellbc_range> <UMI_range> <Threads> <StudyName> <Outdir> <pigz-executable> \n
Usage: perl $0 <barcode-Read.fq.gz> <cDNA-Read.fq.gz> <plateBC-read.fq.gz> <cellbc_threshold> <Cellbc_Qual_threshold> <umi_threshold> <UMIbc_Qual_threshold> <Cellbc_range> <PlateBC_range> <UMI_range> <Threads> <StudyName> <Outdir> <pigz-executable> \n
Explanation of parameter:

barcode-Read.fq.gz	- Input barcode reads fastq file name.
@@ -30,16 +30,18 @@ exit;

$bcread=$ARGV[0];
$cdnaread=$ARGV[1];
$bnbases=$ARGV[2];
$bqualthreshold=$ARGV[3];
$mnbases=$ARGV[4];
$mqualthreshold=$ARGV[5];
$bcrange=$ARGV[6];
$mcrange=$ARGV[7];
$threads=$ARGV[8];
$study=$ARGV[9];
$outdir=$ARGV[10];
$pigz=$ARGV[11];
$pbcread=$ARGV[2];
$bnbases=$ARGV[3];
$bqualthreshold=$ARGV[4];
$mnbases=$ARGV[5];
$mqualthreshold=$ARGV[6];
$bcrange=$ARGV[7];
$pbcrange=$ARGV[8];
$mcrange=$ARGV[9];
$threads=$ARGV[10];
$study=$ARGV[11];
$outdir=$ARGV[12];
$pigz=$ARGV[13];

@b = split("-",$bcrange);
@m = split("-",$mcrange);
@@ -47,23 +49,37 @@ $bs = $b[0] - 1;
$ms = $m[0] - 1;
$bl = $b[1]-$b[0]+1;
$ml = $m[1]-$m[0]+1;
if($pbcread ne "NA") {
	@p = split("-",$pbcrange);
	$ps = $p[0] - 1;
	$pl = $p[1]-$p[0]+1;
}



$bcreadout = $outdir."/".$study.".barcodelist.filtered.sam";
$bcreadoutfull = $outdir."/".$study.".barcoderead.filtered.fastq";
if($pbcread ne "NA") {$pbcreadoutfull = $outdir."/".$study.".platebarcoderead.filtered.fastq";}
$cdnareadout = $outdir."/".$study.".cdnaread.filtered.fastq";

if ($bcread =~ /\.gz$/) {
open BCF, '-|', $pigz, '-dc', $bcread || die "Couldn't open file $bcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
if($pbcread ne "NA") {open PBCF, '-|', $pigz, '-dc', $pbcread || die "Couldn't open file $pbcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";}
open CDF, '-|', $pigz, '-dc', $cdnaread || die "Couldn't open file $cdnaread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
}
else {
open BCF, "<", $bcread || die "Couldn't open file $bcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
if($pbcread ne "NA") {open PBCF, "<", $pbcread || die "Couldn't open file $pbcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";}
open CDF, "<", $cdnaread || die "Couldn't open file $cdnaread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
}

open BCOUT, ">", $bcreadout || die "Couldn't open file $bcreadout to write\n\n";;
open CDOUT, ">", $cdnareadout || die "Couldn't open file $cdnareadout to write\n\n";;
open BCOUTFULL, ">", $bcreadoutfull || die "Couldn't open file $bcreadoutfull to write\n\n";;
if($pbcread ne "NA") {open PBCOUTFULL, ">", $pbcreadoutfull || die "Couldn't open file $pbcreadoutfull to write\n\n";;}




$count=0;
$total=0;
@@ -75,7 +91,13 @@ $total++;
	$brseq=<BCF>;
	$bqid=<BCF>;
	$bqseq=<BCF>;

	if($pbcread ne "NA"){
		$pbrid=<PBCF>;
		$pbrseq=<PBCF>;
		$pbqid=<PBCF>;
		$pbqseq=<PBCF>;
		$pbcqual = substr($pbqseq,$ps,$pl);
	}
	if($count==0){
		$count=1;
		@quals = map {$_} unpack "C*", $bqseq;
@@ -95,14 +117,26 @@ $total++;

	if($c[0] eq $b[0]){
		@bquals = map {$_ - $offset} unpack "C*", $bcqual;
		#@pbquals = map {$_ - $offset} unpack "C*", $pbcqual;
		@mquals = map {$_ - $offset} unpack "C*", $mcqual;
		$btmp = grep {$_ < $bqualthreshold} @bquals;
		#$pbtmp = grep {$_ < $bqualthreshold} @pbquals;
		$mtmp = grep {$_ < $mqualthreshold} @mquals;

		if(($btmp < $bnbases) && ($mtmp < $mnbases)){
		$filtered++;
			$bcseq = substr($brseq,$bs,$bl);
			if($pbcread ne "NA") {$pbcseq = substr($pbrseq,$ps,$pl);}
			$mcseq = substr($brseq,$ms,$ml);

			$brid =~ m/^@(.*)\s/; chomp($brseq);
			if($pbcread ne "NA") {
				print BCOUT $1,"\t4\t*\t0\t0\t*\t*\t0\t0\t",$pbcseq,$bcseq,$mcseq,"\t",$pbcqual,$bcqual,$mcqual,"\n";
				print PBCOUTFULL $pbrid,$pbrseq,"\n",$pbqid,$pbqseq;
			}
			else {
				print BCOUT $1,"\t4\t*\t0\t0\t*\t*\t0\t0\t$brseq\t$bqseq";
			}
			print BCOUTFULL $brid,$brseq,"\n",$bqid,$bqseq;
			print CDOUT $crid,$crseq,$cqid,$cqseq;
		}
@@ -118,6 +152,11 @@ close CDF;
close BCOUT;
close CDOUT;
close BCOUTFULL;
if($pbcread ne "NA") {
	close PBCF;
	close PBCOUTFULL;
	`$pigz -f -p $threads $pbcreadoutfull`;
}

print "Raw reads: $total \nFiltered reads: $filtered \n\n";

+1 −1
Original line number Diff line number Diff line
@@ -206,7 +206,7 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
      umiseq <- sort(umiseq)
      uc     <- data.frame(us = umiseq) %>% dplyr::count(us) # normal UMI counts

      if(length(uc$us)>1 && length(uc$us)<10000){ #prevent use of > 100Gb RAM
      if(length(uc$us)>1 && length(uc$us)<100000){ #prevent use of > 100Gb RAM
        umi <- stringdist::stringdistmatrix(uc$us,method="hamming",useNames = "strings",nthread=1) %>% #only 1 core for each multidplyr worker
          broom::tidy() %>%
          dplyr::filter( distance <= edit  ) %>% # only remove below chosen dist
+3 −1
Original line number Diff line number Diff line
@@ -13,6 +13,8 @@ cq="${10}"
t="${11}"
d="${12}"
pigz="${13}"
pbcfastq="${14}"
pbcrange="${15}"

# MAKING THE HEADER
echo '#!/bin/bash' >$o/$sn.prep.sh
@@ -23,6 +25,6 @@ echo '#SBATCH --cpus-per-task='$t >>$o/$sn.prep.sh
echo '#SBATCH --workdir='$o >>$o/$sn.prep.sh
echo '#SBATCH --mem=1000' >>$o/$sn.prep.sh

echo "srun perl $d/fqfilter.pl $f1 $f2 $cq $cbq $mq $mbq $xc $xm $t $sn $o $pigz" >>$o/$sn.prep.sh
echo "srun perl $d/fqfilter.pl $f1 $f2 $pbcfastq $cq $cbq $mq $mbq $xc $pbcrange $xm $t $sn $o $pigz" >>$o/$sn.prep.sh

sbatch $o/$sn.prep.sh > $o/$sn.preparejobid.txt
+43 −9
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh &  Christoph Ziegenhain
# Contact: parekh@bio.lmu.de or christoph.ziegenhain@ki.se or hellmann@bio.lmu.de

vers=0.0.4
function check_opts() {
    value=$1
    name=$2
@@ -64,6 +64,8 @@ Make sure you have 3-4 times more disk space to your input fastq files.
					This pipeline works based on one hit per read. Therefore, please do not report more multimapping hits. Default: "".
	-H  <HammingDistance>    : Hamming distance collapsing of UMI sequences. Default: 0.
	-B  <BarcodeBinning>     : Hamming distance binning of close cell barcode sequences. Default: 0.
        -T  <PlateBC fastq>      : Fastq file for plate barcode read. Default: NA
        -U  <PlateBC range>      : Barcode range for plate barcode read (e.g. 1-8). Default: NA

## Program paths ##
	-o  <outputdir>          : Where to write output bam. Default: working directory.
@@ -97,6 +99,8 @@ Make sure you have 3-4 times more disk space to your input fastq files.
	-F  <gel-barcode2 fastq> : Provide the second half of gel barcode + UMI read <-F> here. Default: NA.
	-L  <library barcode fastq> : Provide the library barcode read here. Default: NA

zUMIs version $vers

EOF
}

@@ -130,8 +134,10 @@ xcrange2=0-0
nreads=100
ham=0
XCbin=0
pbcfastq=NA
pbcrange=NA

while getopts ":R:S:f:r:g:o:a:t:s:c:m:l:b:n:N:q:Q:z:u:x:e:p:i:d:X:A:w:j:F:C:y:Y:L:P:V:H:B:h" options; do #Putting <:> between keys implies that they can not be called without an argument.
while getopts ":R:S:f:r:g:o:a:t:s:c:m:l:b:n:N:q:Q:z:u:x:e:p:i:d:X:A:w:j:F:C:y:Y:L:P:V:H:B:U:T:h" options; do #Putting <:> between keys implies that they can not be called without an argument.
  case $options in
  R ) isslurm=$OPTARG;;
  S ) isStats=$OPTARG;;
@@ -170,6 +176,8 @@ while getopts ":R:S:f:r:g:o:a:t:s:c:m:l:b:n:N:q:Q:z:u:x:e:p:i:d:X:A:w:j:F:C:y:Y:
  V ) Rexc=$OPTARG;;
  H ) ham=$OPTARG;;
  B ) XCbin=$OPTARG;;
  T ) pbcfastq=$OPTARG;;
  U ) pbcrange=$OPTARG;;
  h ) usage
          exit 1;;
  \? ) echo -e "\n This key is not available! Please check the usage again: -$OPTARG"
@@ -250,6 +258,8 @@ echo -e "\n\n You provided these parameters:
 # bases below phred in UMI:	$molbcbase
 Hamming Distance (UMI):	$ham
 Hamming Distance (CellBC):	$XCbin
 Plate Barcode Read:    $pbcfastq
 Plate Barcode range:   $pbcrange
 Barcodes:			$barcodes
 zUMIs directory:		$zumisdir
 STAR executable		$starexc
@@ -263,7 +273,8 @@ echo -e "\n\n You provided these parameters:
 Barcode read2(STRT-seq):	$bcread2
 Barcode read2 range(STRT-seq):	$xcrange2
 Bases(G) to trim(STRT-seq):	$BaseTrim
 Subsampling reads:		$subsampling \n\n"
 Subsampling reads:		$subsampling \n\n
 zUMIs version $vers \n\n" | tee "$sname.zUMIs_run.txt"

#create output folders
[ -d $outdir/zUMIs_output/ ] || mkdir $outdir/zUMIs_output/
@@ -279,6 +290,29 @@ if [[ "$isCustomFASTQ" != "no" ]] ; then
fi
#Submit all the jobs
if [[ "$isslurm" == "yes" ]] ; then
  if [[ "$pbcfastq" != "NA" ]] ; then
    tmpa=`echo $pbcrange | cut -f1 -d '-'`
    tmpb=`echo $pbcrange | cut -f2 -d '-'`
    pbcl=`expr $tmpa + $tmpb - 1`
    tmpa=`echo $xcrange | cut -f1 -d '-'`
    tmpb=`echo $xcrange | cut -f2 -d '-'`
    bcl=`expr $tmpa + $tmpb - 1`
    l=`expr $bcl + $pbcl`
    xc=1-"$l"
    xcst=1
    xcend=$l
    xmst=`expr $l + 1`
    tmpa=`echo $xmrange | cut -f1 -d '-'`
    tmpb=`echo $xmrange | cut -f2 -d '-'`
    ml=`expr $tmpb - $tmpa`
    xmend=`expr $xmst + $ml`
    xmr="$xmst"-"$xmend"
    xcr="$xcst"-"$xcend"
  else
    xmr=$xmrange
    xcr=$xcrange
  fi

	case "$whichStage" in
		"filtering")
		if [[ "$isstrt" == "yes" ]] ; then
@@ -286,11 +320,11 @@ if [[ "$isslurm" == "yes" ]] ; then
		elif [[ "$isindrops" == "yes" ]] ; then
			bash $zumisdir/zUMIs-filtering-inDrops.sh $cdnaread $bcread $libread $bcread2 $sname $outdir $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $zumisdir $pigzexc
		else
			bash $zumisdir/zUMIs-filtering.sh $bcread $cdnaread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $zumisdir $pigzexc
			bash $zumisdir/zUMIs-filtering.sh $bcread $cdnaread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $zumisdir $pigzexc $pbcfastq $pbcrange
		fi
			bash $zumisdir/zUMIs-mapping.sh $sname $outdir $genomedir $gtf $threads $readlen "$starparams" $starexc $samtoolsexc $xmrange $BaseTrim $isstrt
			bash $zumisdir/zUMIs-prepCounting.sh $sname $outdir $threads $samtoolsexc
			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
			;;
		"mapping")
@@ -309,7 +343,7 @@ if [[ "$isslurm" == "yes" ]] ; then
		fi
			bash $zumisdir/zUMIs-mapping.sh $sname $outdir $genomedir $gtf $threads $readlen "$starparams" $starexc $samtoolsexc $xmrange $BaseTrim $isstrt
			bash $zumisdir/zUMIs-prepCounting.sh $sname $outdir $threads $samtoolsexc
			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
			;;
		"counting")
@@ -331,14 +365,14 @@ if [[ "$isslurm" == "yes" ]] ; then
			bash $zumisdir/zUMIs-prepCounting.sh $sname $outdir $threads $samtoolsexc
		fi

			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
			;;
		"summarising")
			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
			bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
			;;
	esac
else
	bash $zumisdir/zUMIs-noslurm.sh $cdnaread $bcread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $genomedir $gtf $readlen "$starparams" $starexc $barcodes $strandedness $subsampling $zumisdir $samtoolsexc $isStats $whichStage $bcread2 $BaseTrim $isstrt $xcrange2 $CustomMappedBAM $isCustomFASTQ $nreads $isindrops $libread $pigzexc $Rexc $ham $XCbin
	bash $zumisdir/zUMIs-noslurm.sh $cdnaread $bcread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $genomedir $gtf $readlen "$starparams" $starexc $barcodes $strandedness $subsampling $zumisdir $samtoolsexc $isStats $whichStage $bcread2 $BaseTrim $isstrt $xcrange2 $CustomMappedBAM $isCustomFASTQ $nreads $isindrops $libread $pigzexc $Rexc $ham $XCbin $pbcfastq $pbcrange
fi
Loading