Commit 6dfa108f authored by TomKellyGenetics's avatar TomKellyGenetics
Browse files

update filtering for SmartSeq2 and add default barcodes

parent 03d38c00
Loading
Loading
Loading
Loading
+26 −20
Original line number Diff line number Diff line
@@ -212,9 +212,9 @@ Mandatory arguments to long options are mandatory for short options too.
                                  CEL-Seq (8 bp barcode, 4 bp UMI): celseq
                                  CEL-Seq2 (6 bp UMI, 6 bp barcode): celseq2
                                  Drop-Seq (12 bp barcode, 8 bp UMI): dropseq, nadia
                                  ICELL8 3′ scRNA version 2 (11 bp barcode, No UMI): icell8-non-umi, icell8-v2
                                  ICELL8 3′ scRNA version 3 (11 bp barcode, 14 bp UMI): icell8
                                  ICELL8 5′ scRNA with TCR OR kit (10bp barcode, NO bp UMI): icell8-5-prime
                                  ICELL8 3\′ scRNA version 2 (11 bp barcode, No UMI): icell8-non-umi, icell8-v2
                                  ICELL8 3\′ scRNA version 3 (11 bp barcode, 14 bp UMI): icell8
                                  ICELL8 5\′ scRNA with TCR OR kit (10bp barcode, NO bp UMI): icell8-5-prime
                                  ICELL8 full-length scRNA with Smart-Seq (16 bp barcode, No UMI): icell8-full-length
                                  inDrops version 1 (19 bp barcode, 6 bp UMI): indrops-v1, 1cellbio-v1
                                  inDrops version 2 (19 bp barcode, 6 bp UMI): indrops-v2, 1cellbio-v2
@@ -246,10 +246,10 @@ Mandatory arguments to long options are mandatory for short options too.
  -b,  --barcodefile FILE       Custom barcode list in plain text (with each line containing a barcode)
  
  -c,  --chemistry CHEM         Assay configuration, autodetection is not possible for converted files: 'SC3Pv2' (default), 'SC5P-PE', 'SC5P-R1', 'SC5P-R2', 'threeprime', or 'fiveprime'
                                    5′ scRNA-Seq ('SC5P-PE') is available only for 10x Genomics, ICELL8, SmartSeq, and STRT-Seq technologies.
                                    5\′ scRNA-Seq ('SC5P-PE') is available only for 10x Genomics, ICELL8, SmartSeq, and STRT-Seq technologies.
                                    Setting 'SC3Pv1' for 10x version 1 chemistry is recommended.
                                    All other technologies default to 3′ scRNA-Seq parameters. Only 10x Genomics, ICELL8, and SmartSeq2 allow choosing which to use.
                                    For SmartSeq2 this parameter detemines using full-length sequences or 5′ ends with internal reads removed.
                                    All other technologies default to 3\′ scRNA-Seq parameters. Only 10x Genomics, ICELL8, and SmartSeq2 allow choosing which to use.
                                    For SmartSeq2 this parameter determines using full-length sequences or 5\′ ends with internal reads removed.
  
  -n,  --force-cells NUM        Force pipeline to use this number of cells, bypassing the cell detection algorithm.
  -j,  --jobmode MODE           Job manager to use. Valid options: 'local' (default), 'sge', 'lsf', or a .template file
@@ -1933,10 +1933,10 @@ else
             if [[ ! -f ${whitelistdir}/splitseq_barcode.txt ]]; then
                 echo "  generating combination of I1, I2, and RT barcodes ..."
             fi
    elif [[ "$technology" == "smartseq2" ]] || [[ "$technology" == "smartseq3" ]]; then
        barcodefile=${whitelistdir}/Illumina_Nextera_dual_barcodes.txt
        if [[ ! -f ${whitelistdir}/Illumina_Nextera_dual_barcodes.txt ]]; then
            echo "  generating combination of I1 and I2 barcodes ..."
     elif [[ "$technology" == "smartseq2" ]]; then
         barcodefile=${whitelistdir}/SmartSeq2_full_barcodes.txt
    elif [[ "$technology" == "smartseq3" ]]; then
        barcodefile=${whitelistdir}/SmartSeq3_full_barcodes.txt
        fi
    elif [[ "$technology" == "strt-seq" ]]; then
         barcodefile=${whitelistdir}/STRTSeq_barcode.txt
@@ -3449,11 +3449,17 @@ else
                perl ${FILTERSMARTSEQREADUMI} --r1 ${convR1} --r2 ${convR2} --i1 ${convI1} --i2 ${convI2} --tag 'AAGCAGTGGTATCAACGCAGAGTACGG' --out_dir ${crIN}
                echo "  ... trim tag sequence from R1"

                # returns R1 with tag sequence removed (left trim) starting with 8pbp UMI and corresponding reads for I1, I2, and R2
                # returns R1 with tag sequence removed (left trim) starting with a 13 bp TSO and corresponding reads for I1, I2, and R2
                mv $crIN/parsed_R1.fastq ${convR1}
                mv $crIN/parsed_R2.fastq ${convR2}
                mv $crIN/parsed_I1.fastq ${convI1}
                mv $crIN/parsed_I2.fastq ${convI2}
                if [[ $verbose ]]; then
                    cp ${convR1} $crIN/parsed_R1.fastq
                    cp ${convR2} $crIN/parsed_R2.fastq
                    cp ${convI1} $crIN/parsed_I1.fastq
                    cp ${convI2} $crIN/parsed_I2.fastq
                fi
            else
                #if [[ ${chemistry} == "SC3Pv2" ]] || [[ ${chemistry} == "SC5P-PE" ]];
                # remove tag sequence adapter (first occurrence only)
@@ -3477,6 +3483,7 @@ else
            mv $crIN/Concatenated_File.fastq ${convR1}
            
            if [[ $nonUMI == "true" ]]; then
                echo "  ... add mock UMI to count reads for to R1 files"
                #add mock UMI (count reads instead of UMI) barcodelength=16, umi_default=10
                perl ${ADDMOCKUMI} --fastq ${convR1} --out_dir ${crIN} --head_length ${barcodelength} --umi_length ${umi_default}
                umilength=${umi_default}
@@ -3487,13 +3494,12 @@ else
                mv $crIN/mock_UMI.fastq ${convR1}
            fi

            # skip adding TSO for 3' chemistry
            if [[ ${chemistry} != "SC3Pv2" ]] && [[ ${chemistry} != "SC3Pv3" ]] && [[ ${chemistry} != "auto" ]]; then
            # skip adding TSO for 3' chemistry (and 5' R1 which includes this in tag filtering)
            if [[ ${chemistry} != "SC3Pv2" ]] && [[ ${chemistry} != "SC3Pv3" ]] && [[ ${chemistry} != "SC5P-R1" ]] && [[ ${chemistry} != "auto" ]]; then
                #convert TSO to expected length for 10x 5' (TSS in R1 from base 39)
                echo " handling $convFile ..."
                tsoS="TTTCTTATATGGG"
                tsoQ="IIIIIIIIIIIII"

                #Add 10x TSO characters to the end of the sequence
                cmd=$(echo 'sed -E "2~4s/(.{'$barcodelength'})(.{'${umilength}'})/\1\2'$tsoS'/" '$convFile' > '${crIN}'/.temp')

@@ -3534,16 +3540,16 @@ else
            echo "  ... parsing R1 reads with tag sequence and inserting 10x TSO"
            perl ${FILTERSMARTSEQREADUMI} --r1 ${convR1} --r2 ${convR2} --i1 ${convI1} --i2 ${convI2} --tag 'ATTGCGCAATG' --out_dir ${crIN}
             
            #returns R1 with tag sequence removed (left trim) starting with 8pbp UMI and corresponding reads for I1, I2, and R2
            #returns R1 with tag sequence removed (left trim) starting with 8bp UMI and 13 bp TSO and corresponding reads for I1, I2, and R2
            mv $crIN/parsed_R1.fastq ${convR1}
            mv $crIN/parsed_R2.fastq ${convR2}
            mv $crIN/parsed_I1.fastq ${convI1}
            mv $crIN/parsed_I2.fastq ${convI2}
            if [[ $verbose ]]; then
                cp ${convR1} $crIN/parsed_R1.fastq
                cp ${convR1} $crIN/parsed_R1.fastq
                cp ${convR1} $crIN/parsed_R1.fastq
                cp ${convR1} $crIN/parsed_R1.fastq
                cp ${convR2} $crIN/parsed_R2.fastq
                cp ${convI1} $crIN/parsed_I1.fastq
                cp ${convI2} $crIN/parsed_I2.fastq
            fi
            
            #concatenate barcodes from dual indexes to R1 as barcode (bases 1-16)
+4 −6
Original line number Diff line number Diff line
@@ -80,7 +80,7 @@ $r2_out =~ s/\/\//\//;

#set technology
my $technology = "";
if ($tag eq "AAGCAGTGGTATCAACGCAGAGTAC") {
if ($tag eq "AAGCAGTGGTATCAACGCAGAGTACGG") {
	$technology = "SmartSeq2";
}
elsif ($tag eq "ATTGCGCAATG") {
@@ -127,9 +127,7 @@ while (my $line = <R1>) {
	#trim r1 data by tag
	my $r1_trim_seq = $r1_seq;
	chomp $r1_trim_seq;
	my @trim = split (/$tag/, $r1_trim_seq);
	shift @trim;
	$r1_trim_seq = join ("$tag", @trim);
	$r1_trim_seq = substr ($r1_trim_seq, length($tag) + 1);
	my $r1_trim_q = $r1_q;
	chomp $r1_trim_q;
	$r1_trim_q = reverse $r1_trim_q;
@@ -148,8 +146,8 @@ while (my $line = <R1>) {
	my $r1_trim_swop_seq;
	my $r1_trim_swop_q;
	if ($technology eq "SmartSeq2") {
		$r1_trim_swop_seq = substr ($r1_trim_seq, 3);
		$r1_trim_swop_q = substr ($r1_trim_q, 3);
		$r1_trim_swop_seq = ($tso_seq.$r1_trim_seq);
		$r1_trim_swop_q = ($tso_q.$r1_trim_q);
	}
	elsif ($technology eq "SmartSeq3") {
		if (length($r1_trim_seq) > 11) {
+2711894 −0

File added.

Preview size limit exceeded, changes collapsed.

+19567 −0

File added.

Preview size limit exceeded, changes collapsed.