Unverified Commit 0c6c95b3 authored by hy298's avatar hy298 Committed by GitHub
Browse files

Delete ATAC-seq directory

parent c419e91b
Loading
Loading
Loading
Loading

ATAC-seq/README.md

deleted100644 → 0
+0 −20
Original line number Diff line number Diff line
# ATAC-seq data mapping and peak calling

## reads mapping
get_data.sh
run_getData.sh

## remove duplicates
remdup.sh
run_remdup.sh

## peakcalling
peakCalling.sh
run_peakCalling.sh
run_MACS.NoModel.Nolambda.pl





ATAC-seq/get_data.sh

deleted100644 → 0
+0 −432
Original line number Diff line number Diff line
#!/bin/bash
######################################
# 2013-05-13
# David Ruau, Department of Haematology
# Gottgens lab
# CIMR, University of Cambridge
# All right reserved.
# Licence: GPL (>=2)
######################################

######################################
# GLOBAL VARIABLES
######################################
# This is the location where the reference genome for bowtie2 are stored
resource='/serenity/data/reference-genomes/'
# and this for bowtie1, only considered to process human samples
resource_bw1='/serenity/data/reference-genomes/'
# The GNU parallel utility is not working on all platforms.
# set to no if you do not want to use parallel
use_parallel='yes'
# Sensitivity level for trimGalore
# Choose either WARN or FAIL (default)
trimLevel="FAIL"

usage(){
	echo 
    echo -e '\033[1m ChIP-seq /ATAC  pipeline\033[0m'
	echo 
	echo -e '\033[1mNAME\033[0m'
	echo '     get_data_local'
	echo 
	echo -e '\033[1mSYNOPSIS\033[0m'
	echo '     usage: get_data_local  [-g top_folder] [-m sub_folder] [-x ref_genome] [-i original fastq] [--merged] [--paired]  [--fasta]'
	echo 
    echo -e '\033[1mDESCRIPTION\033[0m'
	echo '     This utility will process raw sequence data from ChIP-seq/ATAC experiments in fastQ format.'
	echo '     When processing data, it will run FastQC quality check on each fastQ file,'
	echo '     trim the adpaters if present and run the aligner Bowtie2.'
	echo '     Human samples processed with the hg19 reference genome will be aligned with Bowtie 0.12.9.'
	echo '     It can process fasta file with --fasta options'
	echo '     You have to specify if the data are paired-end using the --paired option.'
	echo 
	echo '     The options are as follows:'
	echo 
    echo -e '     \033[1m-h, --help\033[0m'
    echo '             Show this message'
	echo 
    echo -e '     \033[1m-g \033[4mcommand\033[0m'
	echo '             The top folder to fastQ files'
	echo 
    echo -e '     \033[1m-m \033[4mcommand\033[0m'
    echo '             The sub folder to fastQ files'
	echo 
	echo -e '     \033[1m-i \033[4mcommand\033[0m'
	echo '             Sample Name'
	echo 
	echo -e '     \033[1m-x \033[4mcommand\033[0m'
	echo '             [optional] Reference genome: mm10, mm9, hg19, rn4 or rn5. If empty program will quit after fastQC'
	echo 
	echo -e '     \033[1m--paired\033[0m'
	echo '             [optional] Flag indicating that we are dealing with paired sequence reads.'
	echo
	echo -e '     \033[1m--fasta\033[0m'
	echo '             [optional] Flag indicating that we are processing fasta files. Fasta files need to be merged before hand using cat and placed in the GSE/GSM/ folder. The --fasta flag imply --local. It is used to specify that the "local" files are in fasta format and not in fastQ format.'
	echo 
	echo '             # To process local reads in fastQ or fasta formats.'
	echo '             # The files need to be in: top_folder/sub_folder/file.fastq or top_folder/sub_folder/file.fa'
	echo '             get_data_local  -g top_folder -m sub_folder -i original_fastq -x mm10'
	echo '             get_data_local  -g top_folder -m sub_folder -i original_fastq   -x mm10 --paired'		
	echo '             get_data_local --fasta -g top_folder -m sub_folder -i original_fastq  -x hg19'
	echo 
	echo  -e '\033[1mAUTHOR\033[0m'
	echo '     David Ruau <davidruau@gmail.com>'
	echo '     Department of Haematology, Gottgens lab'
	echo '     CIMR, University of Cambridge'
	echo '     Licence: GPL (>=2)'
	echo
	echo '02 Oct, 2013'
	exit 1
}

do_bowtie(){
	case "$GENOME" in
		mm10|mm9|hg19|rn4|rn5 )
			
			if [[ $GENOME == "hg19" ]] ; then
				echo '==> Running Bowtie 0.12.9...'
			else
				echo '==> Running Bowtie2...'
			fi
			
			REPORT="report_bowtie_"$GSM".txt"
			samFile=$GSM'_full.sam'
			
			if [[ $PAIRED == 0 ]]; then
				## ALIGNMENT TO REFERENCE GENOME STEP
				# For mouse dataset we use bowtie2 with the options described at
				# http://haemcode.stemcells.cam.ac.uk/doc_pipeline.php
				# -k 2 tell bowtie2 to keep reads with at most 2 match
				# -N 1 allow for one mismatch in the seed (20bp default).
				# the rest of the options is just for speed
				if [[ $FASTA == 1 ]];then
					echo '   - Processing FASTA sequence reads'
					if [[ $GENOME == "hg19" ]] ; then
						echo 'bowtie -m 2 -v 1 --best --strata --seed 0 '$resource_bw1$GENOME '-f *.fa -S '$samFile '2> '$REPORT
						bowtie -m 2 -v 1 --best --strata --seed 0 $resource_bw1$GENOME -f *.fa -S $samFile 2> $REPORT
					else
						echo 'bowtie2 -k 2 -N 1 --mm -x '$resource$GENOME '-f -U *.fa -S' $samFile '-p 20 2>' $REPORT
						bowtie2 -k 2 -N 1 --mm -x $resource$GENOME -f -U *.fa -S $samFile -p 20 2> $REPORT
					fi
										
				else
					echo '   - Processing '$GSM'.fastq'
					if [[ $GENOME == "hg19" ]] ; then
						echo 'bowtie -m 2 -v 1 --best --strata --seed 0'  $resource_bw1$GENOME '-q' $GSM'.fastq -S' $samFile '2>' $REPORT
						bowtie -m 2 -v 1 --best --strata --seed 0  $resource_bw1$GENOME -q $GSM.fastq -S $samFile 2> $REPORT
					else
						echo 'bowtie2 -k 2 -N 1 --mm -x' $resource$GENOME '-U' $GSM'.fastq' '-S' $samFile '-p 20 2> '$REPORT
						bowtie2 -k 2 -N 1 --mm -x $resource$GENOME -U $GSM'.fastq' -S $samFile -p 20  2> $REPORT
					fi
				fi
				## We remove the lines of the SAM file indicating the read matched multiple location. Keeping 
				# only uniquely mappable reads. The awk command filter for reads that actually map the forward 
				# strand (0) and the reverse strand (16) and also keep the header (@).
				grep -v XS:i: $samFile | awk '{if($2==0 || $2==16 || $1~/^@/) print $0}' > $GSM'.sam'
				
			else
				# IDENTIFY the fastQ files
				fileArray=(`ls *.fastq`)
				echo "   - Processing paired data ${fileArray[0]} and ${fileArray[1]}"
				if [[ $GENOME == "hg19" ]] ; then
					bowtie -m 2 -v 1 --best --strata --seed 0 $resource_bw1$GENOME -q -1 ${fileArray[0]} -2 ${fileArray[1]} -S $samFile 2> $REPORT
				else
					bowtie2 -k 2 -N 1 --mm --no-mixed --no-discordant  -x $resource$GENOME -1 ${fileArray[0]} -2 ${fileArray[1]} -S $samFile -p 20  2> $REPORT
				fi			
				## We remove the lines of the SAM file indicating the read matched multiple location. Keeping 
				# only uniquely mappable reads. The awk command filter for reads mate mapping uniquely the 
				# forward and reverse strand and also keep the headers.
				grep chr $samFile | grep -v "XS:i:" | awk '{if($2==147 || $2==83 || $2==99 || $2==163 || $2==81 || $2==97 || $2==145 || $2==161 || $1~/^@/) print $0}' > $GSM.sam
			fi
			rm $samFile
			if [[ $GENOME == "hg19" ]] ; then
				echo 
				echo -e "\033[1m\e[00;31m***\e[00m Human samples are processed with Bowtie 0.12.9!!! \e[00;31m***\e[00m\033[0m"
				echo 
				echo -e "\033[1m\e[00;31m***\e[00m Don't forget to copy the Bowtie report into the staging DB !!! \e[00;31m***\e[00m\033[0m"
				echo
				echo -e "   - Bowtie report in \e[00;31m$REPORT\e[00m"
			else
				echo 
				echo -e "\033[1m\e[00;31m***\e[00m Don't forget to copy the Bowtie2 report into the staging DB !!! \e[00;31m***\e[00m\033[0m"
				echo
				echo -e "   - Bowtie2 report in \e[00;31m$REPORT\e[00m"
			fi		
			;;
		* )
			echo "==> Unknown or No reference genome given. Exiting... Bowtie2 not run."
			;;
	esac
}

doQC(){
	FASTQ=$1
	PAIRED=$2
	GENOME=$3
	
	if [[ $PAIRED == 0 ]]
	then
		fastqc --quiet -f fastq $GSM'.fastq'
	else
		if [[ $use_parallel == "yes" ]]; then
			parallel --gnu 'fastqc --quiet -f fastq {}' ::: *.fastq
		else
			for filename in ./*.fastq; do 
				fastqc --quiet -f fastq "${filename}" 
			done
		fi
	fi
	
	# In case of multiple fastQ files for paired data you have to process the fastQC/trim_galore step differently.
	# In case of merged files you already ran fastQC and rtimgalore so no need for this step.
	
	if [[ $trimLevel == "WARN" ]]; then
		ERROR="$(grep -h -E $trimLevel$'\tOverrepresented sequences' *_fastqc/summary.txt | wc -l)"
		if [[ $ERROR == 0 ]]; then
			ERROR="$(grep -h -E FAIL$'\tOverrepresented sequences' *_fastqc/summary.txt | wc -l)"
		fi
	else
		ERROR="$(grep -h -E $trimLevel$'\tOverrepresented sequences' *_fastqc/summary.txt | wc -l)"
	fi
	
	if [[ $PAIRED == 0 ]]; then
		
		if [[ $ERROR != 0 ]]; then
			echo -e '\033[1m\e[00;31m   - fastQC FAIL   Overrepresented sequences\e[00m (adapters)\033[0m'
			echo  
			echo '==> Running trimGalore'
			
			# cleaning if get_data was interupted.
			FILE_TEST=`test -f *_trimmed.fq && echo 1`
			if [[ ! -z $FILE_TEST ]]; then
				rm -f *_trimmed.fq
			fi
			
			adapterArray=( $(awk '/#Sequence\tCount\tPercentage\tPossible Source/{flag=1;next} />>END_MODULE/{flag=0} flag {print}' */fastqc_data.txt | cut -f 1))
			
			## remove the unspecific sequence from array
			## those pattern remove all sequences from fastQ files
			# declare -a adapterArray=( ${adapterArray[@]/*NNNNNNNNNNNNNNNNNNNNNNNNN/} )			
			
			## Copy the original fastQ files for backup
			echo "==> Making copy of original fastQ file before trimming..."
			mkdir originals
			cp $GSM.fastq originals/"$GSM"_original.fq
			cp "$GSM"_fastqc.zip originals/"$GSM"_original-fastqc.zip
			
			for (( i=0; i<=$(( ${#adapterArray[@]} -1 )); i++ ))
			do
				echo -e "   - Removing adapter $(($i + 1)) out of ${#adapterArray[@]}"
				echo -e "        removing: ${adapterArray[$i]}"
				# trim_galore remove the adpater if the full lengh of the adpator is found (aka: stringency)
				trim_galore --no_report_file --quality 20 -a ${adapterArray[$i]} --stringency ${#adapterArray[$i]} $GSM.fastq &>> $GSM"_trimming_report.logFile"
				mv "$GSM"_trimmed.fq "$GSM".fastq
			done
			echo '   - trim_galore done'
			# erase old fastQC report
			rm -Rf "$GSM"_fastqc*
			# run fastQC on trimmed file
			echo '   - fastQC on trimmed fastQ file...'
			fastqc --quiet -f fastq $GSM'.fastq'
			
			# If reads are still too long
			# seqtk trimfq -b 1 -e 50 GSM1074872_trimmed.fq > GSM1074872_trimmed_1-50.fq
		fi
		echo '   - fastQC reported NO overrepresented sequences'
	else
		# We are dealing with non merged data but multiple fastQ file because paired reads.
		# Loop through the fastQC report and apply trim galore on both fastQ files
		# fastq file are named SRRxxx.fastq
		if [[ $ERROR != 0 ]]; then
			echo -e '\033[1m\e[00;31m   - fastQC FAIL   Overrepresented sequences\e[00m (adapters)\033[0m'
			echo 
			
			FAULTY_FASTQ=($(grep -h -E $trimLevel$'\tOverrepresented sequences' */summary.txt | perl -pe 's/.*\t(.*)\.fastq/\1/g'))
			
			for i in $( seq 0 $((${#FAULTY_FASTQ[@]} - 1)) ); do
				echo "==> Running trimGalore on ${FAULTY_FASTQ[$i]}"
				
				# cleaning if get_data was interupted.
				FILE_TEST=`test -f ${FAULTY_FASTQ[$i]}_trimmed.fq && echo 1`
				if [[ ! -z $FILE_TEST ]]; then
					rm -f *_trimmed.fq
				fi
				
				## get fastq file second reads
				# this is convoluted... trim_galore process paired end file together and one need to give
				# the name of the other file. Basically here we loop over all the adapter sequences for the
				# first file and then over the adpater seq for the second file. I do not use the -a2 option yet.
				index=$(echo ${FAULTY_FASTQ[$i]} | perl -pe 's/.*_(.*)/\1/g')
				fileroot=$(echo ${FAULTY_FASTQ[$i]} | perl -pe 's/(.*)_.*/\1/g')
				if [[ $index == 1 ]]; then
					OTHER_SRA_FILE=$fileroot"_2.fastq"
					theFile=$(echo ${FAULTY_FASTQ[$i]} | perl -pe 's/(.*)\.fastq/\1/g')
					otherFile=$(echo $OTHER_SRA_FILE | perl -pe 's/(.*)\.fastq/\1/g')
				else
					OTHER_SRA_FILE=$fileroot"_1.fastq"
					theFile=$(echo ${FAULTY_FASTQ[$i]} | perl -pe 's/(.*)\.fastq/\1/g')
					otherFile=$(echo $OTHER_SRA_FILE | perl -pe 's/(.*)\.fastq/\1/g')
				fi
				
				## Copy the original fastQ files for backup
				# Even if only one of the paired-end file contain adapter we treat both file at the same time.
				# We follow the recommendation of trimGalore. Because we trim both file_1 and file_2  we backup both.
				echo "==> Making copy of original fastQ file before trimming..."
				mkdir originals
				cp ${FAULTY_FASTQ[$i]}.fastq originals/${FAULTY_FASTQ[$i]}_original.fq
				cp ${FAULTY_FASTQ[$i]}_fastqc.zip originals/${FAULTY_FASTQ[$i]}_original-fastqc.zip
				cp $OTHER_SRA_FILE originals/$otherFile"_original.fq"
				cp $otherFile"_fastqc.zip" originals/$otherFile"_original-fastqc.zip"
									
				adapterArray=( $(awk '/#Sequence\tCount\tPercentage\tPossible Source/{flag=1;next} />>END_MODULE/{flag=0} flag {print}' ${FAULTY_FASTQ[$i]}"_fastqc"/fastqc_data.txt | cut -f 1))
				
				## remove the unspecific sequence from array
				# those pattern remove al sequences from fastQ files
				# declare -a adapterArray=( ${adapterArray[@]/*NNNNNNNNNNNNNNNNNNNNNNNNN/} )			
				
				for (( j=0; j<=$(( ${#adapterArray[@]} -1 )); j++ ))
				do
					echo -e "   - Removing adapter $(($j + 1)) out of ${#adapterArray[@]}"
					echo -e "        removing: ${adapterArray[$i]}"
					# trim_galore remove the adpater if the full lengh of the adpator is found (aka: stringency)			
					trim_galore --no_report_file --paired --quality 20 -a ${adapterArray[$j]} --stringency ${#adapterArray[$j]} ${FAULTY_FASTQ[$i]}.fastq $OTHER_SRA_FILE &>> $theFile"_trimming_report.logFile"
					
					## here we need the suffix of the *_val_* file
					index=$(echo ${FAULTY_FASTQ[$i]}_val_*.fq | perl -pe 's/.*_val_(.*).fq/\1/g')
					if [[ $index == 1 ]]; then
						mv "${FAULTY_FASTQ[$i]}"_val_1.fq "${FAULTY_FASTQ[$i]}".fastq
						mv $otherFile"_val_2.fq" $OTHER_SRA_FILE
					else
						mv "${FAULTY_FASTQ[$i]}"_val_2.fq "${FAULTY_FASTQ[$i]}".fastq
						mv $otherFile"_val_1.fq" $OTHER_SRA_FILE
					fi
				done
				echo '   - trim_galore done'
				
				# erase old fastQC report. Original fastqc report is left intact.
				rm -Rf *_fastqc*
				
			done # for faulty
			
			echo '   - fastQC on trimmed fastQ file...'
			if [[ $use_parallel == "yes" ]]; then
				parallel --gnu 'fastqc --quiet -f fastq {}' ::: *.fastq
			else
				for filename in $PWD/*.fastq; do 
					fastqc --quiet -f fastq $filename
				done
			fi
		else
			echo '   - fastQC reported NO overrepresented sequences'
		fi # if ERROR
		
	fi # IF PAIRED
	
	## get the PHRED info from fastQC report
	encoding=($(grep -h -E "^Encoding(.*)" */fastqc_data.txt | sort | uniq | perl -pe 's/Encoding\s+(.*)/\1/g'))
	# transform array to string
	encoding=${encoding[@]}
	echo "   - fastQ format: "$encoding"<<"
	if [[ $encoding == "Sanger / Illumina 1.9" ]] || [[ $encoding == "Sanger" ]]; then
		echo '   - Quality encoding: phred33'
		QUALITY="--phred33"
	elif [[ $encoding == "Illumina 1.5" ]] || [[ $encoding == "Illumina <1.3" ]] || [[ $encoding == "Illumina 1.3" ]]; then
		echo '   - Quality encoding: phred64'
		QUALITY="--phred64"
	else
		echo "\033[1m\e[00;31m***\e[00m fastQ format encoding not recognised! Report fastQC report to developer. Assuming default --phred33 \e[00;31m***\e[00m\033[0m"
		QUALITY="--phred33"
		# QUALITY="--solexa-quals"
	fi
}

GSE=
GSM=
GENOME=
FILE=
FASTQ=
PAIRED=0
FASTA=0

# Note that the : after an option flag means that it should have a value instead of
# just being the boolean flag that a is.
# OPTS=`getopt -o hg:m:s:x: --long help,trimmed -- "$@"`
OPTS=`getopt -o hg:m:x:i: --long help,paired,fasta -- "$@"`
if [ $? != 0 ]
then
	# something went wrong, getopt will put out an error message for us
    exit 1
fi

eval set -- "$OPTS"

while true
do
	case "$1" in
        -h | --help)
            usage
			;;
		# for options with required arguments, an additional shift is required
        -g)
            GSE=$2
			shift 2;;
        -m)
            GSM=$2
			shift 2;;
		-x)
			GENOME=$2
			shift 2;;
		-i) 
			FILE=$2
			shift 2;;
		--paired)
			PAIRED=1
			shift;;
		--fasta)
			FASTA=1
			shift;;
		--) break;;
		--*) break;;
        -?)
            usage
            ;;
    esac
done


if [[ -z $GSE ]] || [[ -z $GSM ]] || [[ -z $FILE ]]|| [[ -z $GENOME ]]; then
	   usage
else
	if [[ $PAIRED == 0 ]]; then

		mkdir -p $GSE/$GSM
		mv $FILE $GSE/$GSM/$GSM".fastq"	
		cd $GSE/$GSM
	else
		mkdir -p $GSE/$GSM
		mv $FILE".r_1.fastq" $GSE/$GSM/$GSM".r_1.fastq" 
		mv $FILE".r_2.fastq" $GSE/$GSM/$GSM".r_2.fastq"
		cd $GSE/$GSM
	fi
	##############
	## fastQC
	##############
	echo 
	echo '==> fastQC on fastQ file...'
	doQC $GSM $PAIRED $GENOME

	##############
	## BOWTIE2
	##############
	do_bowtie
fi	


echo 
echo '==> Summary:'
echo 
cd ../..
tree -shFL 1 $GSE/$GSM

echo 
echo -e "\033[1m\e[00;31mdone.\e[00m\033[0m"

ATAC-seq/peakCalling.sh

deleted100644 → 0
+0 −5
Original line number Diff line number Diff line
#!/bin/bash
REPORT=MACS.$3.log #$3 is pvalue
mkdir -p $1/$2/PeakCalling/MACS.NoModel.NoLambda
cd $1/$2/PeakCalling/MACS.NoModel.NoLambda
run_MACS.NoModel.NoLambda.pl $2.nodup.BED $3  2> $REPORT

ATAC-seq/remdup.sh

deleted100644 → 0
+0 −10
Original line number Diff line number Diff line
#!/bin/bash

GSE=$1
GSM=$2
fileName=$GSE/$GSM/$GSM
picard-tools SortSam INPUT=$fileName.sam OUTPUT=$fileName.sorted.bam SO=coordinate
picard-tools MarkDuplicates INPUT=$fileName.sorted.bam OUTPUT=$fileName.nodup.bam REMOVE_DUPLICATES=true METRICS_FILE=$fileName.metricN.log  VALIDATION_STRINGENCY=LENIENT
samtools index $fileName.nodup.bam
samtools view -bhS $fileName.sam > $fileName.bam
rm $fileName.sam $fileName.sorted.bam
+0 −45
Original line number Diff line number Diff line
#! /usr/bin/perl -w

use strict;


my $input = $ARGV[0]; # .BED file (sample file)
chomp $input;
my $val = $ARGV[1]; # p value
chomp $val;

#my $control = $ARGV[1]; # control file in .BED format if present
#chomp $control;



unless ($input =~ m/\.BED$/)
	{print "\n\nInput must be .BED\n\n"; exit;}

my $inputFile="../../".$input;
my @filename1 = split(/\.BED/, $input);


my $name = join ("", @filename1);
print "\n Processing $name\n\n";


my $cmd = "";
my $genome ="mm10";
my $gSize="mm";

my $name2 = $name.".NoModel.NoLambda_p".$val;
$cmd = "macs2 callpeak -t $inputFile  -g $gSize -n $name2 -f BED -p $val --nomodel --nolambda  --bdg ";
print "\n\nRunning command: $cmd\n\n";
system($cmd);

my $new_input = $name2."_peaks.xls";
my $new_output = $name2."_400bp.bed";
$cmd = "peak_finding_result2bed_sv.pl $new_input $new_output 4 $genome " ;
#system($cmd);


#print "\n $name FINISHED. Results in ./MACS2/$folder_tmp3/\n\n";
print "\n $name FINISHED.\n\n";

exit;
Loading