Add files via upload (26f95562) · Commits · Chaos / 3D_chromatin_in_AML

ATAC-seq/README.md

0 → 100644

+22 −0

Original line number	Diff line number	Diff line
		# ATAC-seq data mapping and peak calling

		Chromatin accessibility probed by ATAC-seq was analysed in like manner as H3K27ac ChIP-seq analysis, including the procedures of trimming, mapping, filtering and peak calling. Briefly, trimmed sequences were mapped against mm10 reference genome using Bowtie2 and only uniquely mapped reads were kept. Peaks were called using MACS2 with the setting “-nomodel -nolambda” and only those with p-values less than 1e-20 were considered significant. A list of ATAC-seq consensus peak set was made using DiffBind.

		## reads mapping
		get_data.sh
		run_getData.sh

		## remove duplicates
		remdup.sh
		run_remdup.sh

		## peakcalling
		peakCalling.sh
		run_peakCalling.sh
		run_MACS.NoModel.Nolambda.pl

ATAC-seq/get_data.sh

0 → 100644

+432 −0

Original line number	Diff line number	Diff line
		#!/bin/bash
		######################################
		# 2013-05-13
		# David Ruau, Department of Haematology
		# Gottgens lab
		# CIMR, University of Cambridge
		# All right reserved.
		# Licence: GPL (>=2)
		######################################

		######################################
		# GLOBAL VARIABLES
		######################################
		# This is the location where the reference genome for bowtie2 are stored
		resource='/serenity/data/reference-genomes/'
		# and this for bowtie1, only considered to process human samples
		resource_bw1='/serenity/data/reference-genomes/'
		# The GNU parallel utility is not working on all platforms.
		# set to no if you do not want to use parallel
		use_parallel='yes'
		# Sensitivity level for trimGalore
		# Choose either WARN or FAIL (default)
		trimLevel="FAIL"

		usage(){
		echo
		echo -e '\033[1m ChIP-seq /ATAC pipeline\033[0m'
		echo
		echo -e '\033[1mNAME\033[0m'
		echo ' get_data_local'
		echo
		echo -e '\033[1mSYNOPSIS\033[0m'
		echo ' usage: get_data_local [-g top_folder] [-m sub_folder] [-x ref_genome] [-i original fastq] [--merged] [--paired] [--fasta]'
		echo
		echo -e '\033[1mDESCRIPTION\033[0m'
		echo ' This utility will process raw sequence data from ChIP-seq/ATAC experiments in fastQ format.'
		echo ' When processing data, it will run FastQC quality check on each fastQ file,'
		echo ' trim the adpaters if present and run the aligner Bowtie2.'
		echo ' Human samples processed with the hg19 reference genome will be aligned with Bowtie 0.12.9.'
		echo ' It can process fasta file with --fasta options'
		echo ' You have to specify if the data are paired-end using the --paired option.'
		echo
		echo ' The options are as follows:'
		echo
		echo -e ' \033[1m-h, --help\033[0m'
		echo ' Show this message'
		echo
		echo -e ' \033[1m-g \033[4mcommand\033[0m'
		echo ' The top folder to fastQ files'
		echo
		echo -e ' \033[1m-m \033[4mcommand\033[0m'
		echo ' The sub folder to fastQ files'
		echo
		echo -e ' \033[1m-i \033[4mcommand\033[0m'
		echo ' Sample Name'
		echo
		echo -e ' \033[1m-x \033[4mcommand\033[0m'
		echo ' [optional] Reference genome: mm10, mm9, hg19, rn4 or rn5. If empty program will quit after fastQC'
		echo
		echo -e ' \033[1m--paired\033[0m'
		echo ' [optional] Flag indicating that we are dealing with paired sequence reads.'
		echo
		echo -e ' \033[1m--fasta\033[0m'
		echo ' [optional] Flag indicating that we are processing fasta files. Fasta files need to be merged before hand using cat and placed in the GSE/GSM/ folder. The --fasta flag imply --local. It is used to specify that the "local" files are in fasta format and not in fastQ format.'
		echo
		echo ' # To process local reads in fastQ or fasta formats.'
		echo ' # The files need to be in: top_folder/sub_folder/file.fastq or top_folder/sub_folder/file.fa'
		echo ' get_data_local -g top_folder -m sub_folder -i original_fastq -x mm10'
		echo ' get_data_local -g top_folder -m sub_folder -i original_fastq -x mm10 --paired'
		echo ' get_data_local --fasta -g top_folder -m sub_folder -i original_fastq -x hg19'
		echo
		echo -e '\033[1mAUTHOR\033[0m'
		echo ' David Ruau <davidruau@gmail.com>'
		echo ' Department of Haematology, Gottgens lab'
		echo ' CIMR, University of Cambridge'
		echo ' Licence: GPL (>=2)'
		echo
		echo '02 Oct, 2013'
		exit 1
		}

		do_bowtie(){
		case "$GENOME" in
		mm10\|mm9\|hg19\|rn4\|rn5 )

		if [[ $GENOME == "hg19" ]] ; then
		echo '==> Running Bowtie 0.12.9...'
		else
		echo '==> Running Bowtie2...'
		fi

		REPORT="report_bowtie_"$GSM".txt"
		samFile=$GSM'_full.sam'

		if [[ $PAIRED == 0 ]]; then
		## ALIGNMENT TO REFERENCE GENOME STEP
		# For mouse dataset we use bowtie2 with the options described at
		# http://haemcode.stemcells.cam.ac.uk/doc_pipeline.php
		# -k 2 tell bowtie2 to keep reads with at most 2 match
		# -N 1 allow for one mismatch in the seed (20bp default).
		# the rest of the options is just for speed
		if [[ $FASTA == 1 ]];then
		echo ' - Processing FASTA sequence reads'
		if [[ $GENOME == "hg19" ]] ; then
		echo 'bowtie -m 2 -v 1 --best --strata --seed 0 '$resource_bw1$GENOME '-f *.fa -S '$samFile '2> '$REPORT
		bowtie -m 2 -v 1 --best --strata --seed 0 $resource_bw1$GENOME -f *.fa -S $samFile 2> $REPORT
		else
		echo 'bowtie2 -k 2 -N 1 --mm -x '$resource$GENOME '-f -U *.fa -S' $samFile '-p 20 2>' $REPORT
		bowtie2 -k 2 -N 1 --mm -x $resource$GENOME -f -U *.fa -S $samFile -p 20 2> $REPORT
		fi

		else
		echo ' - Processing '$GSM'.fastq'
		if [[ $GENOME == "hg19" ]] ; then
		echo 'bowtie -m 2 -v 1 --best --strata --seed 0' $resource_bw1$GENOME '-q' $GSM'.fastq -S' $samFile '2>' $REPORT
		bowtie -m 2 -v 1 --best --strata --seed 0 $resource_bw1$GENOME -q $GSM.fastq -S $samFile 2> $REPORT
		else
		echo 'bowtie2 -k 2 -N 1 --mm -x' $resource$GENOME '-U' $GSM'.fastq' '-S' $samFile '-p 20 2> '$REPORT
		bowtie2 -k 2 -N 1 --mm -x $resource$GENOME -U $GSM'.fastq' -S $samFile -p 20 2> $REPORT
		fi
		fi
		## We remove the lines of the SAM file indicating the read matched multiple location. Keeping
		# only uniquely mappable reads. The awk command filter for reads that actually map the forward
		# strand (0) and the reverse strand (16) and also keep the header (@).
		grep -v XS:i: $samFile \| awk '{if($2==0 \|\| $2==16 \|\| $1~/^@/) print $0}' > $GSM'.sam'

		else
		# IDENTIFY the fastQ files
		fileArray=(`ls *.fastq`)
		echo " - Processing paired data ${fileArray[0]} and ${fileArray[1]}"
		if [[ $GENOME == "hg19" ]] ; then
		bowtie -m 2 -v 1 --best --strata --seed 0 $resource_bw1$GENOME -q -1 ${fileArray[0]} -2 ${fileArray[1]} -S $samFile 2> $REPORT
		else
		bowtie2 -k 2 -N 1 --mm --no-mixed --no-discordant -x $resource$GENOME -1 ${fileArray[0]} -2 ${fileArray[1]} -S $samFile -p 20 2> $REPORT
		fi
		## We remove the lines of the SAM file indicating the read matched multiple location. Keeping
		# only uniquely mappable reads. The awk command filter for reads mate mapping uniquely the
		# forward and reverse strand and also keep the headers.
		grep chr $samFile \| grep -v "XS:i:" \| awk '{if($2==147 \|\| $2==83 \|\| $2==99 \|\| $2==163 \|\| $2==81 \|\| $2==97 \|\| $2==145 \|\| $2==161 \|\| $1~/^@/) print $0}' > $GSM.sam
		fi
		rm $samFile
		if [[ $GENOME == "hg19" ]] ; then
		echo
		echo -e "\033[1m\e[00;31m*\e[00m Human samples are processed with Bowtie 0.12.9!!! \e[00;31m*\e[00m\033[0m"
		echo
		echo -e "\033[1m\e[00;31m*\e[00m Don't forget to copy the Bowtie report into the staging DB !!! \e[00;31m*\e[00m\033[0m"
		echo
		echo -e " - Bowtie report in \e[00;31m$REPORT\e[00m"
		else
		echo
		echo -e "\033[1m\e[00;31m*\e[00m Don't forget to copy the Bowtie2 report into the staging DB !!! \e[00;31m*\e[00m\033[0m"
		echo
		echo -e " - Bowtie2 report in \e[00;31m$REPORT\e[00m"
		fi
		;;
		* )
		echo "==> Unknown or No reference genome given. Exiting... Bowtie2 not run."
		;;
		esac
		}

		doQC(){
		FASTQ=$1
		PAIRED=$2
		GENOME=$3

		if [[ $PAIRED == 0 ]]
		then
		fastqc --quiet -f fastq $GSM'.fastq'
		else
		if [[ $use_parallel == "yes" ]]; then
		parallel --gnu 'fastqc --quiet -f fastq {}' ::: *.fastq
		else
		for filename in ./*.fastq; do
		fastqc --quiet -f fastq "${filename}"
		done
		fi
		fi

		# In case of multiple fastQ files for paired data you have to process the fastQC/trim_galore step differently.
		# In case of merged files you already ran fastQC and rtimgalore so no need for this step.

		if [[ $trimLevel == "WARN" ]]; then
		ERROR="$(grep -h -E $trimLevel$'\tOverrepresented sequences' *_fastqc/summary.txt \| wc -l)"
		if [[ $ERROR == 0 ]]; then
		ERROR="$(grep -h -E FAIL$'\tOverrepresented sequences' *_fastqc/summary.txt \| wc -l)"
		fi
		else
		ERROR="$(grep -h -E $trimLevel$'\tOverrepresented sequences' *_fastqc/summary.txt \| wc -l)"
		fi

		if [[ $PAIRED == 0 ]]; then

		if [[ $ERROR != 0 ]]; then
		echo -e '\033[1m\e[00;31m - fastQC FAIL Overrepresented sequences\e[00m (adapters)\033[0m'
		echo
		echo '==> Running trimGalore'

		# cleaning if get_data was interupted.
		FILE_TEST=`test -f *_trimmed.fq && echo 1`
		if [[ ! -z $FILE_TEST ]]; then
		rm -f *_trimmed.fq
		fi

		adapterArray=( $(awk '/#Sequence\tCount\tPercentage\tPossible Source/{flag=1;next} />>END_MODULE/{flag=0} flag {print}' */fastqc_data.txt \| cut -f 1))

		## remove the unspecific sequence from array
		## those pattern remove all sequences from fastQ files
		# declare -a adapterArray=( ${adapterArray[@]/*NNNNNNNNNNNNNNNNNNNNNNNNN/} )

		## Copy the original fastQ files for backup
		echo "==> Making copy of original fastQ file before trimming..."
		mkdir originals
		cp $GSM.fastq originals/"$GSM"_original.fq
		cp "$GSM"_fastqc.zip originals/"$GSM"_original-fastqc.zip

		for (( i=0; i<=$(( ${#adapterArray[@]} -1 )); i++ ))
		do
		echo -e " - Removing adapter $(($i + 1)) out of ${#adapterArray[@]}"
		echo -e " removing: ${adapterArray[$i]}"
		# trim_galore remove the adpater if the full lengh of the adpator is found (aka: stringency)
		trim_galore --no_report_file --quality 20 -a ${adapterArray[$i]} --stringency ${#adapterArray[$i]} $GSM.fastq &>> $GSM"_trimming_report.logFile"
		mv "$GSM"_trimmed.fq "$GSM".fastq
		done
		echo ' - trim_galore done'
		# erase old fastQC report
		rm -Rf "$GSM"_fastqc*
		# run fastQC on trimmed file
		echo ' - fastQC on trimmed fastQ file...'
		fastqc --quiet -f fastq $GSM'.fastq'

		# If reads are still too long
		# seqtk trimfq -b 1 -e 50 GSM1074872_trimmed.fq > GSM1074872_trimmed_1-50.fq
		fi
		echo ' - fastQC reported NO overrepresented sequences'
		else
		# We are dealing with non merged data but multiple fastQ file because paired reads.
		# Loop through the fastQC report and apply trim galore on both fastQ files
		# fastq file are named SRRxxx.fastq
		if [[ $ERROR != 0 ]]; then
		echo -e '\033[1m\e[00;31m - fastQC FAIL Overrepresented sequences\e[00m (adapters)\033[0m'
		echo

		FAULTY_FASTQ=($(grep -h -E $trimLevel$'\tOverrepresented sequences' /summary.txt \| perl -pe 's/.\t(.*)\.fastq/\1/g'))

		for i in $( seq 0 $((${#FAULTY_FASTQ[@]} - 1)) ); do
		echo "==> Running trimGalore on ${FAULTY_FASTQ[$i]}"

		# cleaning if get_data was interupted.
		FILE_TEST=`test -f ${FAULTY_FASTQ[$i]}_trimmed.fq && echo 1`
		if [[ ! -z $FILE_TEST ]]; then
		rm -f *_trimmed.fq
		fi

		## get fastq file second reads
		# this is convoluted... trim_galore process paired end file together and one need to give
		# the name of the other file. Basically here we loop over all the adapter sequences for the
		# first file and then over the adpater seq for the second file. I do not use the -a2 option yet.
		index=$(echo ${FAULTY_FASTQ[$i]} \| perl -pe 's/._(.)/\1/g')
		fileroot=$(echo ${FAULTY_FASTQ[$i]} \| perl -pe 's/(.)_./\1/g')
		if [[ $index == 1 ]]; then
		OTHER_SRA_FILE=$fileroot"_2.fastq"
		theFile=$(echo ${FAULTY_FASTQ[$i]} \| perl -pe 's/(.*)\.fastq/\1/g')
		otherFile=$(echo $OTHER_SRA_FILE \| perl -pe 's/(.*)\.fastq/\1/g')
		else
		OTHER_SRA_FILE=$fileroot"_1.fastq"
		theFile=$(echo ${FAULTY_FASTQ[$i]} \| perl -pe 's/(.*)\.fastq/\1/g')
		otherFile=$(echo $OTHER_SRA_FILE \| perl -pe 's/(.*)\.fastq/\1/g')
		fi

		## Copy the original fastQ files for backup
		# Even if only one of the paired-end file contain adapter we treat both file at the same time.
		# We follow the recommendation of trimGalore. Because we trim both file_1 and file_2 we backup both.
		echo "==> Making copy of original fastQ file before trimming..."
		mkdir originals
		cp ${FAULTY_FASTQ[$i]}.fastq originals/${FAULTY_FASTQ[$i]}_original.fq
		cp ${FAULTY_FASTQ[$i]}_fastqc.zip originals/${FAULTY_FASTQ[$i]}_original-fastqc.zip
		cp $OTHER_SRA_FILE originals/$otherFile"_original.fq"
		cp $otherFile"_fastqc.zip" originals/$otherFile"_original-fastqc.zip"

		adapterArray=( $(awk '/#Sequence\tCount\tPercentage\tPossible Source/{flag=1;next} />>END_MODULE/{flag=0} flag {print}' ${FAULTY_FASTQ[$i]}"_fastqc"/fastqc_data.txt \| cut -f 1))

		## remove the unspecific sequence from array
		# those pattern remove al sequences from fastQ files
		# declare -a adapterArray=( ${adapterArray[@]/*NNNNNNNNNNNNNNNNNNNNNNNNN/} )

		for (( j=0; j<=$(( ${#adapterArray[@]} -1 )); j++ ))
		do
		echo -e " - Removing adapter $(($j + 1)) out of ${#adapterArray[@]}"
		echo -e " removing: ${adapterArray[$i]}"
		# trim_galore remove the adpater if the full lengh of the adpator is found (aka: stringency)
		trim_galore --no_report_file --paired --quality 20 -a ${adapterArray[$j]} --stringency ${#adapterArray[$j]} ${FAULTY_FASTQ[$i]}.fastq $OTHER_SRA_FILE &>> $theFile"_trimming_report.logFile"

		## here we need the suffix of the _val_ file
		index=$(echo ${FAULTY_FASTQ[$i]}_val_.fq \| perl -pe 's/._val_(.*).fq/\1/g')
		if [[ $index == 1 ]]; then
		mv "${FAULTY_FASTQ[$i]}"_val_1.fq "${FAULTY_FASTQ[$i]}".fastq
		mv $otherFile"_val_2.fq" $OTHER_SRA_FILE
		else
		mv "${FAULTY_FASTQ[$i]}"_val_2.fq "${FAULTY_FASTQ[$i]}".fastq
		mv $otherFile"_val_1.fq" $OTHER_SRA_FILE
		fi
		done
		echo ' - trim_galore done'

		# erase old fastQC report. Original fastqc report is left intact.
		rm -Rf _fastqc

		done # for faulty

		echo ' - fastQC on trimmed fastQ file...'
		if [[ $use_parallel == "yes" ]]; then
		parallel --gnu 'fastqc --quiet -f fastq {}' ::: *.fastq
		else
		for filename in $PWD/*.fastq; do
		fastqc --quiet -f fastq $filename
		done
		fi
		else
		echo ' - fastQC reported NO overrepresented sequences'
		fi # if ERROR

		fi # IF PAIRED

		## get the PHRED info from fastQC report
		encoding=($(grep -h -E "^Encoding(.)" /fastqc_data.txt \| sort \| uniq \| perl -pe 's/Encoding\s+(.*)/\1/g'))
		# transform array to string
		encoding=${encoding[@]}
		echo " - fastQ format: "$encoding"<<"
		if [[ $encoding == "Sanger / Illumina 1.9" ]] \|\| [[ $encoding == "Sanger" ]]; then
		echo ' - Quality encoding: phred33'
		QUALITY="--phred33"
		elif [[ $encoding == "Illumina 1.5" ]] \|\| [[ $encoding == "Illumina <1.3" ]] \|\| [[ $encoding == "Illumina 1.3" ]]; then
		echo ' - Quality encoding: phred64'
		QUALITY="--phred64"
		else
		echo "\033[1m\e[00;31m*\e[00m fastQ format encoding not recognised! Report fastQC report to developer. Assuming default --phred33 \e[00;31m*\e[00m\033[0m"
		QUALITY="--phred33"
		# QUALITY="--solexa-quals"
		fi
		}

		GSE=
		GSM=
		GENOME=
		FILE=
		FASTQ=
		PAIRED=0
		FASTA=0

		# Note that the : after an option flag means that it should have a value instead of
		# just being the boolean flag that a is.
		# OPTS=`getopt -o hg:m:s:x: --long help,trimmed -- "$@"`
		OPTS=`getopt -o hg:m:x:i: --long help,paired,fasta -- "$@"`
		if [ $? != 0 ]
		then
		# something went wrong, getopt will put out an error message for us
		exit 1
		fi

		eval set -- "$OPTS"

		while true
		do
		case "$1" in
		-h \| --help)
		usage
		;;
		# for options with required arguments, an additional shift is required
		-g)
		GSE=$2
		shift 2;;
		-m)
		GSM=$2
		shift 2;;
		-x)
		GENOME=$2
		shift 2;;
		-i)
		FILE=$2
		shift 2;;
		--paired)
		PAIRED=1
		shift;;
		--fasta)
		FASTA=1
		shift;;
		--) break;;
		--*) break;;
		-?)
		usage
		;;
		esac
		done


		if [[ -z $GSE ]] \|\| [[ -z $GSM ]] \|\| [[ -z $FILE ]]\|\| [[ -z $GENOME ]]; then
		usage
		else
		if [[ $PAIRED == 0 ]]; then

		mkdir -p $GSE/$GSM
		mv $FILE $GSE/$GSM/$GSM".fastq"
		cd $GSE/$GSM
		else
		mkdir -p $GSE/$GSM
		mv $FILE".r_1.fastq" $GSE/$GSM/$GSM".r_1.fastq"
		mv $FILE".r_2.fastq" $GSE/$GSM/$GSM".r_2.fastq"
		cd $GSE/$GSM
		fi
		##############
		## fastQC
		##############
		echo
		echo '==> fastQC on fastQ file...'
		doQC $GSM $PAIRED $GENOME

		##############
		## BOWTIE2
		##############
		do_bowtie
		fi


		echo
		echo '==> Summary:'
		echo
		cd ../..
		tree -shFL 1 $GSE/$GSM

		echo
		echo -e "\033[1m\e[00;31mdone.\e[00m\033[0m"

ATAC-seq/peakCalling.sh

0 → 100644

+5 −0

Original line number	Diff line number	Diff line
		#!/bin/bash
		REPORT=MACS.$3.log #$3 is pvalue
		mkdir -p $1/$2/PeakCalling/MACS.NoModel.NoLambda
		cd $1/$2/PeakCalling/MACS.NoModel.NoLambda
		run_MACS.NoModel.NoLambda.pl $2.nodup.BED $3 2> $REPORT

ATAC-seq/remdup.sh

0 → 100644

+10 −0

Original line number	Diff line number	Diff line
		#!/bin/bash

		GSE=$1
		GSM=$2
		fileName=$GSE/$GSM/$GSM
		picard-tools SortSam INPUT=$fileName.sam OUTPUT=$fileName.sorted.bam SO=coordinate
		picard-tools MarkDuplicates INPUT=$fileName.sorted.bam OUTPUT=$fileName.nodup.bam REMOVE_DUPLICATES=true METRICS_FILE=$fileName.metricN.log VALIDATION_STRINGENCY=LENIENT
		samtools index $fileName.nodup.bam
		samtools view -bhS $fileName.sam > $fileName.bam
		rm $fileName.sam $fileName.sorted.bam

ATAC-seq/run_MACS.NoModel.NoLambda.pl

0 → 100644

+45 −0

Original line number	Diff line number	Diff line
		#! /usr/bin/perl -w

		use strict;


		my $input = $ARGV[0]; # .BED file (sample file)
		chomp $input;
		my $val = $ARGV[1]; # p value
		chomp $val;

		#my $control = $ARGV[1]; # control file in .BED format if present
		#chomp $control;



		unless ($input =~ m/\.BED$/)
		{print "\n\nInput must be .BED\n\n"; exit;}

		my $inputFile="../../".$input;
		my @filename1 = split(/\.BED/, $input);


		my $name = join ("", @filename1);
		print "\n Processing $name\n\n";


		my $cmd = "";
		my $genome ="mm10";
		my $gSize="mm";

		my $name2 = $name.".NoModel.NoLambda_p".$val;
		$cmd = "macs2 callpeak -t $inputFile -g $gSize -n $name2 -f BED -p $val --nomodel --nolambda --bdg ";
		print "\n\nRunning command: $cmd\n\n";
		system($cmd);

		my $new_input = $name2."_peaks.xls";
		my $new_output = $name2."_400bp.bed";
		$cmd = "peak_finding_result2bed_sv.pl $new_input $new_output 4 $genome " ;
		#system($cmd);


		#print "\n $name FINISHED. Results in ./MACS2/$folder_tmp3/\n\n";
		print "\n $name FINISHED.\n\n";

		exit;

Admin message