Add files via upload (3d4f4df3) · Commits · Chaos / 3D_chromatin_in_AML

BagFoot_analysis/README.md

0 → 100644

+8 −0

Original line number	Diff line number	Diff line
		# Bivariate-Genomic-Footprinting

		Code for footprinting and motif-flanking accessibility using BaGFoot (Baek et al)

		aux.R
		bagfoot.R

		Baek S, et al (2017) *Genomic Footprinting Detects Changes in Transcription Factor Activity* ([Cell Rep 19, 1710-1722](https://pubmed.ncbi.nlm.nih.gov/28538187/)).

BagFoot_analysis/aux.R

0 → 100644

+3286 −0

File added.

Preview size limit exceeded, changes collapsed.

BagFoot_analysis/bagfoot.R

0 → 100644

+92 −0

Original line number	Diff line number	Diff line
		#install.packages("hash")
		#install.packages("digest")
		#install.packages("data.table")
		#install.packages("Cairo")
		#install.packages("aplpack")
		#source("https://bioconductor.org/biocLite.R")
		#biocLite("BSgenome.Hsapiens.UCSC.hg19")
		#biocLite("BSgenome.Mmusculus.UCSC.mm9")

		setwd("./bagfoot1")
		library(Rsamtools)
		library(hash)
		library(data.table)
		library(digest)
		library(Cairo)
		library(aplpack)
		library(BSgenome.Hsapiens.UCSC.hg19)
		library(BSgenome.Mmusculus.UCSC.mm9)
		library(BSgenome.Mmusculus.UCSC.mm10)
		library(bagfoot)

		source("aux.R")

		bamfile1= 'LN.bam';
		cc1 = countReadsBAM(bamfile1, refgenome = "mm10"); # counts the number of cuts in the sequence file.
		cc1
		cutcountfile1 = makeCutCountBAM(bamfile1, refgenome = "mm10"); # generate a BedGraph file with ATAC Cleavages counts

		bamfile2= 'DM.bam';
		cc2 = countReadsBAM(bamfile2, refgenome = "mm10");
		cc2
		cutcountfile2 = makeCutCountBAM(bamfile2, refgenome = "mm10");



		hotspotfile1 = 'A1_peaks.csv'; # A peak sites file is comma-separated and must has headers with chromosome, start, end (1-based)
		hotspotfile2 = 'A1_peaks.csv'; # this can be different, or same, set of peaks
		combinedhotspotfile = combineTwoHotspots( hotspotfile1, hotspotfile2, 'LN', 'DM');

		#for bedtools multicov
		pooledBed <- read.csv('pooled_LN_DM_hotspot.csv', header=TRUE)
		write.table(x=pooledBed[,2:4], file = "pooledBed.bed", append = FALSE, quote = FALSE, sep = "\t", row.names =FALSE, col.names = FALSE)

		system(' bedtools multicov -bams LN.bam -bed pooledBed.bed > NcountsFile1.bed')
		system(' bedtools multicov -bams DM.bam -bed pooledBed.bed > NcountsFile2.bed')
		NcountsFile1 <- sum( read.csv('NcountsFile1.bed', head=FALSE , sep="\t")[,4] ,na.rm=TRUE )
		NcountsFile2 <- sum( read.csv('NcountsFile2.bed', head=FALSE , sep="\t")[,4] ,na.rm=TRUE )
		NcountsFile1
		NcountsFile2
		rm(pooledBed)
		system(' rm pooledBed.bed ' )


		tabNoMappability = MakeBiasCorrectionTableBAM( # This function call generates a hexamer bias frequency table from the given bamfile without mappability assumption
		bamfile='all.bam',
		outfile="Hexamer_all_mm10_withoutMap.txt",
		refgenome="mm10",
		np=6,
		mapdir='', # if mappability file directory is set to empty, mappability is not used.
		atac=TRUE # Set TRUE for ATAC data. The default value is FALSE
		);


		# run "BaGFoot"

		LN <- CutCount(file = "./bagfoot1/LN_AC_cutcount.bgr.gz" , count = NcountsFile1, name = "LN" );
		DM <- CutCount(file = "./bagfoot1/DM_AC_cutcount.bgr.gz" , count = NcountsFile2, name = "DM" );

		motifdb_mm10 <- MotifDB(motiflistfile = 'motiflist_mm10_cisbp_filtered.txt', # This file contains the list of FIMO outputs file
		directory='/mm10_cisbp_fimo/'); # Directory of FIMO outputs

		gfootoption_ctrl_treat <- GFootOption(biasfile = "Hexamer_all_mm10_withoutMap.txt", # Hexamer bias frequency table (see "bigfoot_prep_example.R")
		hexamer_pattern= "./bagfoot1/nuccode_mm10_6mer_{chr}.dat"); # nuc. code files generated by MakeBiasCorrectionTableBAM


		# Definition of BiGFoot analysis
		GFoot_ctrl_treat= GFoot(control = LN, ## Control Data defined above
		treatment = DM, ## Treatment Data defined above
		sitefile = "./bagfoot1/pooled_LN_DM_hotspot.csv", ## Sites file: Peak File. This file must have "chr", "st", "ed" in the header.
		motifDB = motifdb_mm10, ## Information about FIMO motifs
		gfootoption= gfootoption_ctrl_treat, ## Options as defined above
		outputdir = "OUTPUT_LN_vs_DM", ## OUTPUT directory
		cachedir = "CACHE_LN_vs_DM"); ## CACHE directory

		GFoot_ctrl_treat <- run(GFoot_ctrl_treat , graphout=T,yrange=c(-2.5,1.5), mc.cores = 24 )



		dat <- read.table('LN_DM_On_pooled_LN_DM_hotspot_footprint_depth_table.csv');
		gen_bagplot_chisq(dat, dataname1='LN', dataname2='DM', factor=2.5); # generates a bagplot from the calculated footprinting depths

BagFoot_analysis/run_bagfoot.sh

0 → 100644

+3 −0

Original line number	Diff line number	Diff line
		#!/bin/bash

		Rscript --vanilla bagfoot.R

Differential_analysis/ATACseq_differential_analysis.R

0 → 100644

+46 −0

Original line number	Diff line number	Diff line
		### Differential analysis by edgeR

		library(GenomicRanges)
		library(GenomicAlignments)
		library("edgeR")
		library("gplots")

		getFit <- function(data, idx, lv, libsize){
		group <- factor(c(1,1,2,2))
		design <- model.matrix(~group)
		data <- data[,idx, drop=FALSE]
		y <- DGEList(counts=data, group=group,lib.size=libsize)
		keep <- rowSums(cpm(y)>1) >= 2
		y <- y[keep, , keep.lib.sizes=FALSE]
		y <- calcNormFactors(y)
		y <- estimateDisp(y, design)
		y <- estimateGLMCommonDisp(y, design)
		y <- estimateGLMTrendedDisp(y, design)
		y <- estimateGLMTagwiseDisp(y, design)
		fit <- glmFit(y, design)
		lrt <- glmLRT(fit, coef=2)
		DEg <- as.data.frame(topTags(lrt, n=10^6))
		write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_ATAC.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
		}

		peakFile=read.delim("./ATAC_consensus_peaks_filter.bed", header = TRUE, sep ="\t")
		peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
		bamFilePath="/serenity/data/Haiyang/ATAC_HY/promoter/bamfiles/"
		bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
		aln <- lapply(bamFiles, readGAlignments)
		counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
		names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

		df=data.frame(as.list(counts),stringsAsFactors=FALSE)
		maxlevel=apply(df, 1, max)
		rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)

		liblen=lengths(aln, use.names = TRUE)
		data=df
		peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
		row.names(data)=peakId

		getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
		getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
		getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])

Admin message