Commit a256a937 authored by kai.b's avatar kai.b Committed by TomKellyGenetics
Browse files

script for parsing SmartSeq-3 FASTQ files

parent 8ac5891e
Loading
Loading
Loading
Loading

sub/parse_fastq.pl

0 → 100644
+156 −0
Original line number Diff line number Diff line
#!/usr/bin/perl

#########################################
#                                       #
#     Written by Kai Battenberg         #
#     Plant Symbiosis Research Team     #
#                                       #
#########################################

use strict;
use warnings;
use Getopt::Long;

#####SCRIPT DESCRIPTION#####
#Script "parse_fastq.pl" given a set of I1, I2, R1, and R2, trims and parses R1 and only keep the corresponding reads in the other files.
###########



#####Options#####
my $ss3i = "ATTGCGCAATG";

#setting the default values.
my $i1 = "";
my $i2 = "";
my $r1 = "";
my $r2 = "";
my $i1_out = "SmartSeq3_parsed_I1.fastq";
my $i2_out = "SmartSeq3_parsed_I2.fastq";
my $r1_out = "SmartSeq3_parsed_R1.fastq";
my $r2_out = "SmartSeq3_parsed_R2.fastq";
my $out_dir = "";

#making the options into external arguments.
GetOptions (
	'i1=s' => \$i1,
	'i2=s' => \$i2,
	'r1=s' => \$r1,
	'r2=s' => \$r2,
	'out_dir=s' => \$out_dir
	);

#Checking options
if (!$i1) {
	die "USAGE: option --i1 <I1.fastq> is required.\n";
}
elsif (!$i2) {
	die "USAGE: option --i2 <I2.fastq> is required.\n";
}
elsif (!$r1) {
	die "USAGE: option --r1 <R1.fastq> is required.\n";
}
elsif (!$r2) {
	die "USAGE: option --r2 <R2.fastq> is required.\n";
}
elsif (!$out_dir) {
	die "USAGE: option --out_dir <OUTPUT_DIRECTORY> is required.\n";
}

$i1_out = $out_dir."/"."SmartSeq3_parsed_I1.fastq";
$i1_out =~ s/\/\//\//;
$i2_out = $out_dir."/"."SmartSeq3_parsed_I2.fastq";
$i2_out =~ s/\/\//\//;
$r1_out = $out_dir."/"."SmartSeq3_parsed_R1.fastq";
$r1_out =~ s/\/\//\//;
$r2_out = $out_dir."/"."SmartSeq3_parsed_R2.fastq";
$r2_out =~ s/\/\//\//;
##########



#####MAIN#####
#get order of reads to keep
my @keepreads = map { ($_ + 2) / 4 } split(/\n/, `grep $ss3i $r1 -n | cut -f1 -d\:`);
my %keepreads = map { $_ => 1 } @keepreads;

#parse reads to keep
my $readcount = 0;
open (I1, "<", $i1) or die "cannot open $i1.\n";
open (I2, "<", $i2) or die "cannot open $i2.\n";
open (R1, "<", $r1) or die "cannot open $r1.\n";
open (R2, "<", $r2) or die "cannot open $r2.\n";
open (I1OUT, ">", $i1_out) or die "cannot open $i1_out.\n";
open (I2OUT, ">", $i2_out) or die "cannot open $i2_out.\n";
open (R1OUT, ">", $r1_out) or die "cannot open $r1_out.\n";
open (R2OUT, ">", $r2_out) or die "cannot open $r2_out.\n";
while (my $line = <R1>) {
	$readcount++;
	my $i1_head = <I1>;
	my $i1_seq = <I1>;
	my $i1_plus = <I1>;
	my $i1_q = <I1>;
	my $i2_head = <I2>;
	my $i2_seq = <I2>;
	my $i2_plus = <I2>;
	my $i2_q = <I2>;
	my $r1_head = $line;
	my $r1_seq = <R1>;
	my $r1_plus = <R1>;
	my $r1_q = <R1>;
	my $r2_head = <R2>;
	my $r2_seq = <R2>;
	my $r2_plus = <R2>;
	my $r2_q = <R2>;
	
	if (!exists $keepreads{$readcount}) {
		next;
	}
	else {
		#trim r1 data
		my @trim = split (/$ss3i/, $r1_seq);
		shift @trim;
		my $new_r1_seq = join ("$ss3i", @trim);
		my $new_r1_q = $r1_q;
		chomp $new_r1_q;
		$new_r1_q = reverse $new_r1_q;
		$new_r1_q = substr ($new_r1_q, 0, length($new_r1_seq) - 1);
		$new_r1_q = reverse $new_r1_q;
		$new_r1_q = $new_r1_q."\n";
		
		print I1OUT "$i1_head";
		print I1OUT "$i1_seq";
		print I1OUT "$i1_plus";
		print I1OUT "$i1_q";
		print I2OUT "$i2_head";
		print I2OUT "$i2_seq";
		print I2OUT "$i2_plus";
		print I2OUT "$i2_q";
		print R1OUT "$r1_head";
		print R1OUT "$new_r1_seq";
		print R1OUT "$r1_plus";
		print R1OUT "$new_r1_q";
		print R2OUT "$r2_head";
		print R2OUT "$r2_seq";
		print R2OUT "$r2_plus";
		print R2OUT "$r2_q";
	}
}
close (I1);
close (I2);
close (R1);
close (R2);
close (I1OUT);
close (I2OUT);
close (R1OUT);
close (R2OUT);









##########
 No newline at end of file