Commit 10fab53f authored by kai.b's avatar kai.b
Browse files

bugfix for SmartSeq3 adjustments

parent 568b78f8
Loading
Loading
Loading
Loading
+1 −25
Original line number Diff line number Diff line
@@ -3343,7 +3343,7 @@ else
            echo "  ...remove internal for ${technology} by matching tag sequence for UMI reads"
            # filter UMI reads by matching tag sequence ATTGCGCAATG (bases 1-11 of R1) and remove as adapters 
            perl ${FILTERSMARTSEQREADUMI} --r1 ${convR1} --r2 ${convR2} --i1 ${convI1} --i2 ${convI2} --tag 'ATTGCGCAATG' --out_dir ${crIN}
            echo "  ...trim tag sequence from R1"
            echo "  ...parsing reads with tag sequence and swoping 10x TSO for R1"
            
            # returns R1 with tag sequence removed (left trim) starting with 8pbp UMI and corresponding reads for I1, I2, and R2
            mv $crIN/parsed_R1.fastq ${convR1}
@@ -3358,30 +3358,6 @@ else
            #returns a combined R1 file with I1-I2-R1 concatenated (I1 and I2 are R1 barcode)
            mv $crIN/Concatenated_File.fastq ${convR1}
            
            #convert TSO to expected length for 10x 5' (TSS in R1 from base 39)
            echo " handling $convFile ..."
            tsoS="TTTCTTATATGGG"
            tsoQ="IIIIIIIIIIIII"
            #Add 10x TSO characters to the end of the sequence
            cmd=$(echo 'sed -E "2~4s/^(.{'$barcodelength'})(.{'${umilength}'})(.{'3'})/\1\2'$tsoS'/" '$convFile' > '${crIN}'/.temp')
            if [[ $verbose ]]; then
                echo technology $technology
                echo barcode: $barcodelength
                echo umi: $umilength
                echo $cmd
            fi
            # run command with barcode and umi length, e.g.,: sed -E "2~4s/^(.{16})(.{8})(.{'3'})(.*)/\1\2$tsoS\4/"  $convFile > ${crIN}/.temp
            eval $cmd
            mv ${crIN}/.temp $convFile
            #Add n characters to the end of the quality
            cmd=$(echo 'sed -E "4~4s/^(.{'$barcodelength'})(.{'${umilength}'})(.{'3'})/\1\2'$tsoQ'/" '$convFile' > '${crIN}'/.temp')
            # run command with barcode and umi length, e.g.,: sed -E "4~4s/^(.{16})(.{8})(.{'3'})(.*)/\1\2$tsoQ\4/"  $convFile > ${crIN}/.temp
            if [[ $verbose ]]; then
                echo $cmd
            fi
            eval $cmd
            #returns an R file with the TSO replaced with the 13 bp 10x Genomics sequence
            mv ${crIN}/.temp $convFile
            echo "  ${convFile} adjusted"
        done
    fi
+74 −38
Original line number Diff line number Diff line
@@ -12,14 +12,20 @@ use warnings;
use Getopt::Long;

#####SCRIPT DESCRIPTION#####
#Script "FilterSmartSeqReadUMI.pl" given a set of I1, I2, R1, and R2, trims and parses R1 and only keep the corresponding reads in the other files.
#Script "FilterSmartSeqReadUMI.pl" given a set of I1, I2, R1, R2, tag, will generate
#   1. R1 file parsed and converted based on tag and TSO sequence respectively
#   2. R2, I1, and I2 files with reads corresponding to R1
###########



#####Options#####
#SmartSeq3 tag
my $tagseq = "ATTGCGCAATG";
#target tag
my $tag = "";

#target tso
my $tso_10x = "TTTCTTATATGGG";
my $tso_10x_q = "IIIIIIIIIIIII";

#setting the default values.
my $i1 = "";
@@ -34,11 +40,11 @@ my $out_dir = "";

#making the options into external arguments.
GetOptions (
	'tag=s' => \$tag,
	'i1=s' => \$i1,
	'i2=s' => \$i2,
	'r1=s' => \$r1,
	'r2=s' => \$r2,
        'tag=s' => \$tagseq,
	'out_dir=s' => \$out_dir
	);

@@ -55,6 +61,9 @@ elsif (!$r1) {
elsif (!$r2) {
	die "USAGE: option --r2 <R2.fastq> is required.\n";
}
elsif (!$tag) {
	die "USAGE: option --tag <TARGET_TAG_SEQUENCE> is required.\n";
}
elsif (!$out_dir) {
	die "USAGE: option --out_dir <OUTPUT_DIRECTORY> is required.\n";
}
@@ -68,15 +77,20 @@ $r1_out = $out_dir."/".$r1_out;
$r1_out =~ s/\/\//\//;
$r2_out = $out_dir."/".$r2_out;
$r2_out =~ s/\/\//\//;

#set technology
my $technology = "";
if ($tag eq "AAGCAGTGGTATCAACGCAGAGTAC") {
	$technology = "SmartSeq2";
}
elsif ($tag eq "ATTGCGCAATG") {
	$technology = "SmartSeq3";
}
##########



#####MAIN#####
#get list of reads to keep
my @keepreads = map { ($_ + 2) / 4 } split(/\n/, `grep $tagseq $r1 -n | cut -f1 -d\:`);
my %keepreads = map { $_ => 1 } @keepreads;

#parse reads to keep
my $readcount = 0;
open (I1, "<", $i1) or die "cannot open $i1.\n";
@@ -108,20 +122,43 @@ while (my $line = <R1>) {
	my $r2_plus = <R2>;
	my $r2_q = <R2>;
	
	if (!exists $keepreads{$readcount}) {
	#remove reads without a tag
	if ($r1_seq !~ m/$tag/) {
		next;
	}
	else {
		#trim r1 data
		my @trim = split (/$tagseq/, $r1_seq);
	
	#trim r1 data by tag
	my $r1_trim_seq = $r1_seq;
	my @trim = split (/$tag/, $r1_trim_seq);
	shift @trim;
		my $new_r1_seq = join ("$tagseq", @trim);
		my $new_r1_q = $r1_q;
		chomp $new_r1_q;
		$new_r1_q = reverse $new_r1_q;
		$new_r1_q = substr ($new_r1_q, 0, length($new_r1_seq) - 1);
		$new_r1_q = reverse $new_r1_q;
		$new_r1_q = $new_r1_q."\n";
	$r1_trim_seq = join ("$tag", @trim);
	$r1_trim_seq = $r1_trim_seq;
	my $r1_trim_q = $r1_q;
	chomp $r1_trim_q;
	$r1_trim_q = reverse $r1_trim_q;
	$r1_trim_q = substr ($r1_trim_q, 0, length($r1_trim_seq) - 1);
	$r1_trim_q = reverse $r1_trim_q;
	$r1_trim_q = $r1_trim_q."\n";
	
	#remove reads too short after trimming
	if ($technology eq "SmartSeq2" && length($r1_trim_seq) < 3) {
		next;
	}
	elsif ($technology eq "SmartSeq3" && length($r1_trim_seq) < 11) {
		next;
	}
	
	#replacing tso
	my $r1_trim_swop_seq;
	my $r1_trim_swop_q;
	if ($technology eq "SmartSeq2") {
		$r1_trim_swop_seq = $tso_10x.(substr ($r1_trim_seq, 3));
		$r1_trim_swop_q = $tso_10x_q.(substr ($r1_trim_q, 3));
	}
	elsif ($technology eq "SmartSeq3") {
		$r1_trim_swop_seq = (substr ($r1_trim_seq, 0, 8)).$tso_10x.(substr ($r1_trim_seq, 11));
		$r1_trim_swop_q = (substr ($r1_trim_q, 0, 8)).$tso_10x_q.(substr ($r1_trim_q, 11));
	}
	
	#print out data
	print I1OUT "$i1_head";
@@ -133,15 +170,14 @@ while (my $line = <R1>) {
	print I2OUT "$i2_plus";
	print I2OUT "$i2_q";
	print R1OUT "$r1_head";
		print R1OUT "$new_r1_seq";
	print R1OUT "$r1_trim_swop_seq";
	print R1OUT "$r1_plus";
		print R1OUT "$new_r1_q";
	print R1OUT "$r1_trim_swop_q";
	print R2OUT "$r2_head";
	print R2OUT "$r2_seq";
	print R2OUT "$r2_plus";
	print R2OUT "$r2_q";
}
}
close (I1);
close (I2);
close (R1);