Unverified Commit 54b15d2f authored by hy298's avatar hy298 Committed by GitHub
Browse files

Delete RNA-seq directory

parent 0d9d9428
Loading
Loading
Loading
Loading

RNA-seq/README.md

deleted100644 → 0
+0 −6
Original line number Diff line number Diff line
# RNA-seq data mapping anlaysis

run_RNASeq_jobs.sh
process_RNASeq_data.sh
runRNA_STAR_paired.pl

RNA-seq/process_RNASeq_data.sh

deleted100644 → 0
+0 −14
Original line number Diff line number Diff line
#!/bin/bash
GSE=$1
GSM=$2
file=$3
genome=$4
genomeDir=$5
annotationFile=$6


mkdir -p $GSE/$GSM
mv $file".r_1.fastq" $GSE/$GSM/$GSM".r_1.fastq"	
mv $file".r_2.fastq" $GSE/$GSM/$GSM".r_2.fastq"
cd $GSE/$GSM
runRNA_STAR_paired.pl  $GSM".r_1.fastq" $GSM".r_2.fastq" $GSM $genome $genomeDir  $annotationFile  "y"

RNA-seq/runRNA_STAR_paired.pl

deleted100644 → 0
+0 −204
Original line number Diff line number Diff line
#! /usr/bin/perl -w

##############################################################################
# Script to QC the raw data, aligned the paired reads, convert the aligned 
# reads in SAM format to bedgraph and bigwig and finally output HT-seq counts
# Step 1: Process each fastq file (fastqc) and collect joint list of adapters
# Step 2: Run STAR on trimmed files
# Step 3: Convert sam to bigwig 
# Step  4: Sort uniquely mappable sam file and generate raw counts
##############################################################################

use strict;
use File::Basename;

if (@ARGV != 7){ print("\n
Please provide: 
<fastq file 1> 
<fastq file 2>
<sample name> 
<genome mm10/hg19> 
<path to STAR genome folder >
<annoataion File>
<exons y/n>
\n\n"); exit;}

my @paired_files=();

my $file1 = $ARGV[0];
chomp $file1;

push(@paired_files, $file1);

my $file2 = $ARGV[1];
chomp $file2;

push(@paired_files, $file2);

my $file1_basename = basename($file1,".fastq");
my $file2_basename = basename($file2,".fastq");

my $GSM = $ARGV[2];
chomp $GSM;

my $genome = $ARGV[3];
chomp $genome;

my $genomeDir="/serenity/data/reference-genomes/".$ARGV[4];
chomp $genomeDir;

my $annotationFile = $genomeDir."/".$ARGV[5];
chomp $annotationFile;

my $exons = $ARGV[6];
chomp $exons;

#unless ($genome eq "hg19"){print "\nCurrently only set up to run hg19... Exiting.\n"; exit;}
#unless ($genome eq "mm10" || $genome eq "hg19"){print "\n\n'Genome not recognised. Please enter 'mm10' or 'hg19' \n"; exit;}
unless ($exons eq "y" || $exons eq "n"){print "\n\n'exons_only' choice not recognised. Please enter only either 'y' or 'n' (lowercase).\n"; exit;}


my $total_overrep = 0;

## Step 1: Process each fastq file (fastqc) and collect joint list of adapters
foreach my $file (@paired_files)
{
### run fastqc on file1 and file2
print "\nProcessing $file...\n";
print "\nRunning fastqc...\n\n";
my $cmd = "fastqc --quiet -f fastq $file";
system($cmd);
### check if overrepresented sequences found
my $basename = basename($file,".fastq");
my $fastqc_data = "./".$basename."_fastqc/fastqc_data.txt";
my $overrep = `grep \">>Overrepresented sequences\" $fastqc_data`;
$cmd= "grep \">>Overrepresented sequences\" $fastqc_data > grep_test.txt";
system($cmd);
if ($overrep =~ m/fail/)
{
	$total_overrep++;
	### retrieve overrep seqs from report
	my $adapters_out = $basename."_adapters.txt";
	my $cmd= "grep -i \"adapter\" $fastqc_data > $adapters_out";
	system($cmd);
	
	##retrieve illumina quality encoding info from report
	my $illumina_out = $basename."_illumina.txt";
	$cmd= "grep -i \"Encoding\" $fastqc_data > $illumina_out";
	system($cmd);
} # close for if 
}#close foreach

if ($total_overrep > 0)
{
### Combine adapters from both files, sort and remove duplicates
my $cmd = "cat *_adapters.txt | sort | uniq > all_adapters.txt";
system($cmd);
### Combine illumina encoding, sort, remove duplicates, extract score and set Phred
my $illumina = `cat *_illumina.txt | sort | uniq`;
chomp $illumina;
my $phred = 0;
my $score = 0;
if ($illumina =~ m/\s+(\d+\.\d+)\s*$/)
{
	$score = $1;
	chomp $score;
} # close for if
if ($score == 1.5)
{
	$phred = "phred64";
}
elsif ($score == 1.9)
{
	$phred = "phred33";
}
else
{
print "\n\nIllumina quality encoding not recognised!\n\n"; 
exit;
} # close for if

### check if overrep seqs contains adapters specifically
open (TEMP, "all_adapters.txt");
my @adapters = <TEMP>;
close(TEMP);
my $adaps_found = 0;
foreach my $adapter_line (@adapters)
{
	if ($adapter_line =~ m/adapter/i) { $adaps_found = 1; last;}
}
### if adapters found, iterate trimming for all adapters found
if ($adaps_found == 1)
{
print "\nAdapters found...\n";
### make copies of the original untrimmed fastq files before trimming
my $cmd= "mkdir originals";
system($cmd);
$cmd= "cp $file1 ./originals/";
system($cmd);
$cmd= "cp $file2 ./originals/";
system($cmd);
	my $j = 0;	
	foreach my $adapter_line (@adapters)
	{
		$j++;
		print "\nTrimming: $adapter_line\n\n";
		my $length = 0;
		my $adapter = "";
		if ($adapter_line =~ m/^(\w+)\t\d+/)
		{
			$adapter = $1;
			chomp $adapter;
			$length = length($adapter);
		}

	my $cmd= "trim_galore --paired --$phred --quality 20 -a $adapter --stringency $length $file1 $file2";
	print "\n$cmd\n\n";
	system($cmd);
	
	my $trimmed1 = $file1_basename."_val_1.fastq";
	$cmd= "mv $trimmed1 $file1";
	system($cmd);
	
	my $trimmed2 = $file2_basename."_val_2.fastq";
	$cmd= "mv $trimmed2 $file2";
	system($cmd);
	
	my $trimmingLog = $file1."_trimming_report.txt";
	my $newTrimmingLog = $file1."_trimming_report".$j.".txt";
	$cmd= "mv $trimmingLog $newTrimmingLog";
	system($cmd);
	
	$trimmingLog = $file2."_trimming_report.txt";
	$newTrimmingLog = $file2."_trimming_report".$j.".txt";
	$cmd= "mv $trimmingLog $newTrimmingLog";
	system($cmd);
	
	}#close foreach adapater
}#close if adaps found
}#close if overrep seqs found ($total_overrep)


## Step 2: Run STAR and extract uniquely mappable reads 
my $cmd= "mkdir STAR_out";
system($cmd);
chdir("STAR_out");

### Run STAR
$cmd= "/serenity/software/STAR_2.4.0k/bin/Linux_x86_64/STAR --genomeDir $genomeDir --readFilesIn ../$file1 ../$file2 --runThreadN 5 --outFilterIntronMotifs None --outFilterMismatchNmax 2 --outFileNamePrefix $GSM.";
system($cmd);

### extract only uniquely mappable reads (NH:i:1 and MAPQ=255) and sam header, remove flag 'nM:i:n' which is unrecognised by sam2bed and remove white space from the ends of each line:
print "\n\nExtracting uniquely mappable reads from sam...\n";
$cmd= "grep \'NH:i:1\\s*\\|NH:i:1\$\\|^\@\' $GSM.Aligned.out.sam | awk \'{if(\$5 == 255) print \$0; else if (\$1 ~ /^\@/) print \$0;}\' | sed \'s/nM:i:[0-9]*//g\' | sed \'s/\\s*\$//\' > $GSM.uniquely_mappable.sam";
system($cmd);

## Step  3: Generate raw counts using HTSeq
### run HTseq
print "\n\nRunning HTSeq...\n";
$cmd= "python -m HTSeq.scripts.count --stranded no --type exon -m intersection-nonempty $GSM.uniquely_mappable.sam $annotationFile --samout $GSM.htseq.samout > $GSM.htseq.counts";
#$cmd= "/usr/bin/python -m HTSeq.scripts.count --stranded no --type exon -m intersection-nonempty $GSM.uniquely_mappable.sam $annotationFile > $GSM.htseq.counts";
system($cmd);
print "\n\nhtseq.counts ready for running DE-Seq! :-)\n\n";

exit;

RNA-seq/run_RNASeq_jobs.sh

deleted100644 → 0
+0 −15
Original line number Diff line number Diff line
#!/bin/bash
# Wildtype
process_RNASeq_data.sh WT.EXPR WT.EXPR.R1 WT.EXPR.R1 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10
process_RNASeq_data.sh WT.EXPR WT.EXPR.R2 WT.EXPR.R2 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10
# Double Mutant
process_RNASeq_data.sh DM.EXPR DM.EXPR.R1 DM.EXPR.R1 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10
process_RNASeq_data.sh DM.EXPR DM.EXPR.R2 DM.EXPR.R2 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10
# NPM1 mutation
process_RNASeq_data.sh NPM1.EXPR NPM1.EXPR.R1 NPM1.EXPR.R1 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10
process_RNASeq_data.sh NPM1.EXPR NPM1.EXPR.R2 NPM1.EXPR.R2 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10
# FLT3 mutation
process_RNASeq_data.sh FLT3.EXPR FLT3.EXPR.R1 FLT3.EXPR.R1 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10
process_RNASeq_data.sh FLT3.EXPR FLT3.EXPR.R2 FLT3.EXPR.R2 STAR-GENOMES-mm10.gencode.vM7.comprehensive gencode.vM7.comprehensive.annotation.gtf  mm10