Commit 485fd1a2 authored by TomKellyGenetics's avatar TomKellyGenetics
Browse files

add support for STRT-Seq with 5' scRNA (non UMI)

parent ff3918e0
Loading
Loading
Loading
Loading
+49 −1
Original line number Diff line number Diff line
@@ -1611,6 +1611,10 @@ else
             fi
    elif [[ "$technology" == "smartseq3" ]]; then
        barcodefile=${whitelistdir}/SmartSeq3_barcode.txt
    elif [[ "$technology" == "strt-seq" ]]; then
         barcodefile=${whitelistdir}/STRTSeq_barcode.txt
    elif [[ "$technology" == "strt-seq-c1" ]]; then
         barcodefile=${whitelistdir}/STRTSeqC1_barcode.txt
    else
        echo "***WARNING: whitelist for ${technology} will be all possible combinations of ${minlength}bp. valid barcode will be 100% as a result***"
        barcodelength=${minlength}
@@ -2550,6 +2554,50 @@ else
        done
    fi

    #STRT-Seq
    # R1 of STRT-Seq carries a barcode but no UMI. For every converted R1 file:
    #   1. sub/AddMockUMI.pl inserts a mock UMI (derived from the read index)
    #      directly after the barcode, so reads are counted instead of UMIs;
    #   2. the 3 bases that follow barcode + mock UMI (GGG of the TSO) are
    #      replaced by the 13 bp 10x TSO, with matching quality characters.
    # Sets umilength, umiadjust and chemistry for the rest of the script.
    if [[ "$technology" == "strt-seq" ]]; then
        echo "  ...processsing for ${technology}"
        if [[ $verbose ]]; then
            echo "Note: STRT-Seq does not contain UMIs"
        fi
        for convFile in "${convFiles[@]}"; do
            read=$convFile
            convR1=$read

            # add mock UMI (count reads instead of UMI) barcodelength=6, umi_default=10
            # NOTE(review): the helper is addressed relative to the current
            # working directory - confirm the script always runs from its own directory.
            echo "  ...generate mock UMI for compatibility"
            # stop here on failure: otherwise the mv below would replace R1
            # with a missing or partially written file
            if ! perl sub/AddMockUMI.pl --fastq="${convR1}" --out_dir "${crIN}" --head_length="${barcodelength}" --umi_length="${umi_default}"; then
                echo "Error: sub/AddMockUMI.pl failed for ${convR1}"
                exit 1
            fi
            umilength=$umi_default
            umiadjust=0
            chemistry="SC5P-PE"
            #returns a combined R1 file with barcode and mock UMI
            ## 6 bp barcode, 10 bp UMI, GGG for TSO
            mv "${crIN}/mock_UMI.fastq" "${convR1}"

            #convert TSO to expected length for 10x 5' (TSS in R1 from base 39)
            echo " handling $convFile ..."
            tsoS="TTTCTTATATGGG"
            tsoQ="IIIIIIIIIIIII"
            # GNU sed "first~step" addresses: 2~4 selects the sequence line and
            # 4~4 the quality line of each 4-line FASTQ record. Both lines get
            # the same replacement length, so they stay in register.
            strtSeqExpr="2~4s/^(.{${barcodelength}})(.{${umilength}})(.{3})/\1\2${tsoS}/"
            strtQualExpr="4~4s/^(.{${barcodelength}})(.{${umilength}})(.{3})/\1\2${tsoQ}/"
            if [[ $verbose ]]; then
                echo "technology $technology"
                echo "barcode: $barcodelength"
                echo "umi: $umilength"
                echo "sed -E -e \"${strtSeqExpr}\" -e \"${strtQualExpr}\" ${convFile} > ${crIN}/.temp"
            fi
            # single pass over the file, run directly (no eval) so that paths
            # containing spaces or shell metacharacters are handled safely
            if ! sed -E -e "${strtSeqExpr}" -e "${strtQualExpr}" "${convFile}" > "${crIN}/.temp"; then
                echo "Error: failed to adjust TSO in ${convFile}"
                exit 1
            fi
            mv "${crIN}/.temp" "${convFile}"
            echo "  ${convFile} adjusted"
        done
    fi

    #SureCell: remove adapter and correct phase blocks
    ## https://github.com/Hoohm/dropSeqPipe/issues/42
    if [[ "$technology" == "surecell" ]]; then

sub/AddMockUMI.pl

0 → 100755
+127 −0
Original line number Diff line number Diff line
#!/usr/bin/perl

#########################################
#                                       #
#     Written by Kai Battenberg         #
#     Plant Symbiosis Research Team     #
#                                       #
#########################################

use strict;
use warnings;
use Getopt::Long;

#####SCRIPT DESCRIPTION#####
#Script "AddMockUMI.pl": given a FASTQ file, a head length (number of characters kept before the insertion point), and a desired UMI length, generates a new FASTQ file with a mock UMI inserted into every read.
###########



######Options#####
my $fastq_in = ""; #path of the input FASTQ file (required).
my $fastq_out = "mock_UMI.fastq"; #output file name; $out_dir is prepended below.
my $head_length = ""; #number of characters to have before insertion of mock UMI.
my $umi_length = ""; #length of desired UMI
my $out_dir = "."; #directory the output file is written to.
##########



#####Checking Options#####
#making the options into external arguments.
GetOptions (
	'fastq=s' => \$fastq_in,
	'head_length=s' => \$head_length,
	'umi_length=s' => \$umi_length,
	'out_dir=s' => \$out_dir
	);

#checking for required options.
#Values are compared with the empty-string defaults instead of being tested for
#truth, so that a legitimate "0" (e.g. --head_length=0) is not reported as missing.
if ($fastq_in eq "") {
	die "USAGE: option --fastq <FASTQ FILE> is required.\n";
}
elsif ($head_length eq "") {
	die "USAGE: option --head_length <INTEGER> is required.\n";
}
elsif ($umi_length eq "") {
	die "USAGE: option --umi_length <INTEGER> is required.\n";
}
elsif ($out_dir eq "") {
	die "USAGE: option --out_dir <OUTPUT_DIRECTORY> is required.\n";
}

$fastq_out = $out_dir."/".$fastq_out;
$fastq_out =~ s/\/\//\//; #collapses the first "//", e.g. when $out_dir ends with "/".
##########


#checking head length
#The first test rejects non-numeric text, so the numeric comparison in the
#second test never sees a non-number.
if ( $head_length !~ /^-?\d+\.?\d*$/ ) {
	die "Error: option --head_length needs to be numeric\n";
}
elsif ( $head_length < 0 || $head_length !~ /^-?\d+$/ ) {
	die "Error: option --head_length needs to be an integer 0 or greater\n";
}

#checking umi length
#A mock UMI needs at least one base, hence the lower bound of 1.
if ( $umi_length !~ /^-?\d+\.?\d*$/ ) {
	die "Error: option --umi_length needs to be numeric\n";
}
elsif ( $umi_length < 1 || $umi_length !~ /^-?\d+$/ ) {
	die "Error: option --umi_length needs to be an integer 1 or greater\n";
}
##########



#####Main#####
#Two bits of the binary read index encode one nucleotide of the mock UMI.
my %nucleotide = (
	'00' => "A",
	'01' => "C",
	'10' => "G",
	'11' => "T"
);

#Returns the mock UMI for a 0-based read index.
#The index is written in binary, left-padded with zeros to 2 * $length bits
#(only the lowest 2 * $length bits are kept), and each bit pair is translated
#through %nucleotide. Consequently the UMIs repeat after 4 ** $length reads.
sub mock_umi {
	my ($index, $length) = @_;
	my $bits = 2 * $length;
	my $padded = ("0" x $bits).sprintf("%b", $index);
	my $binary = substr($padded, length($padded) - $bits);
	return join("", map { $nucleotide{$_} } ($binary =~ m/../g));
}

#Splits a string into the first $position characters and the remainder.
#A string shorter than $position yields the whole string and an empty remainder.
sub split_at {
	my ($string, $position) = @_;
	my $head = substr($string, 0, $position);
	my $tail = length($string) > $position ? substr($string, $position) : "";
	return ($head, $tail);
}

my $insert_q = "I" x $umi_length; #quality string written for the mock UMI bases.
my $read_count = -1; #incremented before use, so the first read has index 0.
open (my $in_fh, "<", $fastq_in) or die "cannot open $fastq_in: $!\n";
open (my $out_fh, ">", $fastq_out) or die "cannot open $fastq_out: $!\n";
while (my $header = <$in_fh>) {
	#blank lines where a header is expected (e.g. at the end of the file) are not reads.
	next if $header =~ /^\s*$/;
	$read_count++;

	#get data from FASTQ file (4 lines per record)
	my $seq = <$in_fh>;
	my $plus = <$in_fh>;
	my $q = <$in_fh>;
	if (!defined $seq || !defined $plus || !defined $q) {
		die "Error: truncated FASTQ record at line $. of $fastq_in.\n";
	}
	chomp ($header, $seq, $plus, $q);

	#insert the mock UMI (and its quality string) after the first $head_length characters
	my $mock_umi = mock_umi($read_count, $umi_length);
	my ($seq_head, $seq_tail) = split_at($seq, $head_length);
	my ($q_head, $q_tail) = split_at($q, $head_length);

	print {$out_fh} "$header\n";
	print {$out_fh} "$seq_head$mock_umi$seq_tail\n";
	print {$out_fh} "$plus\n";
	print {$out_fh} "$q_head$insert_q$q_tail\n";
}
close ($in_fh);
#buffered write errors (e.g. a full disk) only surface when the handle is closed.
close ($out_fh) or die "cannot close $fastq_out: $!\n";
##########
__END__
+96 −0
Original line number Diff line number Diff line
TTTAGG
ATTCCA
GCTCAA
CATCCC
TTGGAC
CTGTGT
GGACAT
CAAAGT
AAGCGG
AATAAA
GAGGAG
GGTACA
AGCGAG
GTCGGT
ATTTGC
AGGACT
GCCCTC
TCGTAA
CCAGAC
TATGTA
ACAATA
ATGCTT
AGTTTA
CACAAG
ATCAAC
TAGTCG
TAGAGA
GTCCCG
TACTTC
AAAGTT
TAAGGG
GTTGCC
AAGTAC
GATCTT
TTAACT
GCGAAT
CCGCTA
TGAAGC
ATACAG
CTTCTG
GAGATC
CCGACG
CTCCAT
AAAACG
TAGCAT
TCGGGT
GTGGTA
CCTAGA
GGGTTT
ATGGCG
TTCATA
AACGCC
GGCTGC
GCTGTG
AGATGG
GTAATG
AGGGTC
ATCTCT
GCCTAG
TCAAAG
CATGAT
TGTGCG
GCAGGA
TCTACC
AGTCGT
CGTGGC
GCGTCC
GAACGC
ACTTAT
TGGATG
TATTGT
ACGTTG
GAATTA
CCATCT
TGATCA
CGTATT
CGGCAG
GACACT
TTCCGC
CTCGCA
GTATAC
TGTCAC
TGCGGA
ACGAGC
ACACCC
CGCTTG
TGCAAT
CAACAA
CTGAAA
AACCTA
ACCTGA
TCACTT
GGGCGA
CGCACC
CGAGTA
CCTTTC