Unverified Commit 3d4f4df3 authored by hy298's avatar hy298 Committed by GitHub
Browse files

Add files via upload

parent ad6a56a8
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
# Bivariate-Genomic-Footprinting

Code for footprinting and motif-flanking accessibility using BaGFoot (Baek et al)

aux.R
bagfoot.R

Baek S, et al (2017) ***Genomic Footprinting Detects Changes in Transcription Factor Activity*** ([Cell Rep 19, 1710-1722](https://pubmed.ncbi.nlm.nih.gov/28538187/)).

BagFoot_analysis/aux.R

0 → 100644
+3286 −0

File added.

Preview size limit exceeded, changes collapsed.

+92 −0
Original line number Diff line number Diff line
#install.packages("hash")
#install.packages("digest")
#install.packages("data.table")
#install.packages("Cairo")
#install.packages("aplpack")
#source("https://bioconductor.org/biocLite.R")
#biocLite("BSgenome.Hsapiens.UCSC.hg19")
#biocLite("BSgenome.Mmusculus.UCSC.mm9")

setwd("./bagfoot1")
library(Rsamtools)
library(hash)
library(data.table)
library(digest)
library(Cairo)
library(aplpack)
library(BSgenome.Hsapiens.UCSC.hg19)
library(BSgenome.Mmusculus.UCSC.mm9)
library(BSgenome.Mmusculus.UCSC.mm10)
library(bagfoot)

source("aux.R")

bamfile1= 'LN.bam';  
cc1 = countReadsBAM(bamfile1, refgenome = "mm10");  # counts the number of cuts in the sequence file.
cc1
cutcountfile1 = makeCutCountBAM(bamfile1, refgenome = "mm10");   # generate a BedGraph file with ATAC Cleavages counts

bamfile2= 'DM.bam';
cc2 = countReadsBAM(bamfile2, refgenome = "mm10");  
cc2
cutcountfile2 = makeCutCountBAM(bamfile2, refgenome = "mm10"); 



hotspotfile1 = 'A1_peaks.csv';   # A peak sites file is comma-separated and must has headers with chromosome, start, end (1-based)
hotspotfile2 = 'A1_peaks.csv';   # this can be different, or same, set of peaks
combinedhotspotfile = combineTwoHotspots( hotspotfile1, hotspotfile2, 'LN', 'DM');

#for bedtools multicov
pooledBed <- read.csv('pooled_LN_DM_hotspot.csv', header=TRUE)
write.table(x=pooledBed[,2:4], file = "pooledBed.bed", append = FALSE, quote = FALSE, sep = "\t", row.names =FALSE, col.names = FALSE)

system(' bedtools multicov -bams LN.bam -bed pooledBed.bed > NcountsFile1.bed')
system(' bedtools multicov -bams DM.bam -bed pooledBed.bed > NcountsFile2.bed')
NcountsFile1 <- sum( read.csv('NcountsFile1.bed', head=FALSE , sep="\t")[,4] ,na.rm=TRUE )
NcountsFile2 <- sum( read.csv('NcountsFile2.bed', head=FALSE , sep="\t")[,4] ,na.rm=TRUE )
NcountsFile1
NcountsFile2
rm(pooledBed)
system(' rm pooledBed.bed ' )


tabNoMappability = MakeBiasCorrectionTableBAM(  # This function call generates a hexamer bias frequency table from the given bamfile without mappability assumption
	bamfile='all.bam',
	outfile="Hexamer_all_mm10_withoutMap.txt",
	refgenome="mm10", 
	np=6, 
	mapdir='',    # if mappability file directory is set to empty, mappability is not used. 
	atac=TRUE     # Set TRUE for ATAC data. The default value is FALSE
	);


# run "BaGFoot"

LN <- CutCount(file = "./bagfoot1/LN_AC_cutcount.bgr.gz" , count = NcountsFile1, name = "LN" ); 
DM <- CutCount(file = "./bagfoot1/DM_AC_cutcount.bgr.gz" , count = NcountsFile2, name = "DM" );

motifdb_mm10 <- MotifDB(motiflistfile = 'motiflist_mm10_cisbp_filtered.txt',   # This file contains the list of FIMO outputs file
			directory='/mm10_cisbp_fimo/');	                       # Directory of FIMO outputs

gfootoption_ctrl_treat <- GFootOption(biasfile = "Hexamer_all_mm10_withoutMap.txt",                   # Hexamer bias frequency table (see "bigfoot_prep_example.R")
					hexamer_pattern= "./bagfoot1/nuccode_mm10_6mer_{chr}.dat");   # nuc. code files generated by MakeBiasCorrectionTableBAM 


# Definition of BiGFoot analysis 
GFoot_ctrl_treat= GFoot(control = LN,      ##  Control Data defined above
			treatment = DM,    ##  Treatment Data defined above
			sitefile = "./bagfoot1/pooled_LN_DM_hotspot.csv",  ##  Sites file: Peak File.  This file must have "chr", "st", "ed" in the header.
			motifDB = motifdb_mm10,                ##  Information about FIMO motifs
			gfootoption= gfootoption_ctrl_treat,   ##  Options as defined above
			outputdir = "OUTPUT_LN_vs_DM",  ## OUTPUT directory
			cachedir = "CACHE_LN_vs_DM");   ## CACHE directory 

GFoot_ctrl_treat <- run(GFoot_ctrl_treat , graphout=T,yrange=c(-2.5,1.5), mc.cores = 24 ) 



dat <- read.table('LN_DM_On_pooled_LN_DM_hotspot_footprint_depth_table.csv');
gen_bagplot_chisq(dat, dataname1='LN', dataname2='DM', factor=2.5);   # generates a bagplot from the calculated footprinting depths

+3 −0
Original line number Diff line number Diff line
#!/bin/bash

Rscript --vanilla bagfoot.R
+46 −0
Original line number Diff line number Diff line
### Differential analysis by edgeR

library(GenomicRanges)
library(GenomicAlignments)
library("edgeR")
library("gplots")

getFit <- function(data, idx, lv, libsize){
  group <- factor(c(1,1,2,2))
  design <- model.matrix(~group)
  data <- data[,idx, drop=FALSE]
  y <- DGEList(counts=data, group=group,lib.size=libsize)
  keep <- rowSums(cpm(y)>1) >= 2
  y <- y[keep, , keep.lib.sizes=FALSE]
  y <- calcNormFactors(y)
  y <- estimateDisp(y, design)
  y <- estimateGLMCommonDisp(y, design)
  y <- estimateGLMTrendedDisp(y, design)
  y <- estimateGLMTagwiseDisp(y, design)
  fit <- glmFit(y, design)
  lrt <- glmLRT(fit, coef=2)
  DEg <- as.data.frame(topTags(lrt, n=10^6))
  write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_ATAC.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
}

peakFile=read.delim("./ATAC_consensus_peaks_filter.bed", header = TRUE, sep ="\t")
peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
bamFilePath="/serenity/data/Haiyang/ATAC_HY/promoter/bamfiles/"
bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
aln <- lapply(bamFiles, readGAlignments)
counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

df=data.frame(as.list(counts),stringsAsFactors=FALSE)
maxlevel=apply(df, 1, max)
rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)

liblen=lengths(aln, use.names = TRUE)
data=df
peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
row.names(data)=peakId

getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])
Loading