Delete Differential_analysis directory (2e64d2f2) · Commits · Chaos / 3D_chromatin_in_AML

Differential_analysis/ATACseq_differential_analysis.R

deleted100644 → 0

+0 −46

Original line number	Diff line number	Diff line
		### Differential analysis by edgeR

		library(GenomicRanges)
		library(GenomicAlignments)
		library("edgeR")
		library("gplots")

		getFit <- function(data, idx, lv, libsize){
		group <- factor(c(1,1,2,2))
		design <- model.matrix(~group)
		data <- data[,idx, drop=FALSE]
		y <- DGEList(counts=data, group=group,lib.size=libsize)
		keep <- rowSums(cpm(y)>1) >= 2
		y <- y[keep, , keep.lib.sizes=FALSE]
		y <- calcNormFactors(y)
		y <- estimateDisp(y, design)
		y <- estimateGLMCommonDisp(y, design)
		y <- estimateGLMTrendedDisp(y, design)
		y <- estimateGLMTagwiseDisp(y, design)
		fit <- glmFit(y, design)
		lrt <- glmLRT(fit, coef=2)
		DEg <- as.data.frame(topTags(lrt, n=10^6))
		write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_ATAC.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
		}

		peakFile=read.delim("./ATAC_consensus_peaks_filter.bed", header = TRUE, sep ="\t")
		peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
		bamFilePath="/serenity/data/Haiyang/ATAC_HY/promoter/bamfiles/"
		bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
		aln <- lapply(bamFiles, readGAlignments)
		counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
		names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

		df=data.frame(as.list(counts),stringsAsFactors=FALSE)
		maxlevel=apply(df, 1, max)
		rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)

		liblen=lengths(aln, use.names = TRUE)
		data=df
		peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
		row.names(data)=peakId

		getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
		getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
		getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])

Differential_analysis/H3K27ac_differential_analysis.R

deleted100644 → 0

+0 −45

Original line number	Diff line number	Diff line
		### Differential analysis by edgeR

		library(GenomicRanges)
		library(GenomicAlignments)
		library("edgeR")
		library("gplots")

		getFit <- function(data, idx, lv, libsize){
		group <- factor(c(1,1,2,2))
		design <- model.matrix(~group)
		data <- data[,idx, drop=FALSE]
		y <- DGEList(counts=data, group=group,lib.size=libsize)
		keep <- rowSums(cpm(y)>1) >= 2
		y <- y[keep, , keep.lib.sizes=FALSE]
		y <- calcNormFactors(y)
		y <- estimateDisp(y, design)
		y <- estimateGLMCommonDisp(y, design)
		y <- estimateGLMTrendedDisp(y, design)
		y <- estimateGLMTagwiseDisp(y, design)
		fit <- glmFit(y, design)
		lrt <- glmLRT(fit, coef=2)
		DEg <- as.data.frame(topTags(lrt, n=10^6))
		write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_H3K27ac.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
		}

		peakFile=read.delim("./H3K27ac_consensus_peaks.bed", header = TRUE, sep ="\t")
		peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
		bamFilePath="/serenity/data/Haiyang/ChIP_HY/bamfiles/H3K27ac"
		bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
		aln <- lapply(bamFiles, readGAlignments)
		counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
		names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

		df=data.frame(as.list(counts),stringsAsFactors=FALSE)
		maxlevel=apply(df, 1, max)
		rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)

		liblen=lengths(aln, use.names = TRUE)
		data=df
		peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
		row.names(data)=peakId

		getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
		getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
		getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])
		No newline at end of file

Differential_analysis/H3K4me1_differential_analysis.R

deleted100644 → 0

+0 −47

Original line number	Diff line number	Diff line
		### Differential analysis by edgeR

		library(GenomicRanges)
		library(GenomicAlignments)
		library("edgeR")
		library("gplots")

		getFit <- function(data, idx, lv, libsize){
		group <- factor(c(1,1,2,2))
		design <- model.matrix(~group)
		data <- data[,idx, drop=FALSE]
		y <- DGEList(counts=data, group=group,lib.size=libsize)
		keep <- rowSums(cpm(y)>1) >= 2
		y <- y[keep, , keep.lib.sizes=FALSE]
		y <- calcNormFactors(y)
		y <- estimateDisp(y, design)
		y <- estimateGLMCommonDisp(y, design)
		y <- estimateGLMTrendedDisp(y, design)
		y <- estimateGLMTagwiseDisp(y, design)
		fit <- glmFit(y, design)
		lrt <- glmLRT(fit, coef=2)
		DEg <- as.data.frame(topTags(lrt, n=10^6))
		write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_H3K4me1.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
		}

		peakFile=read.delim("./enhancerPeaks.merged.re.bed", header = TRUE, sep ="\t")
		peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
		bamFilePath="/serenity/data/Haiyang/ChIP_HY/bamfiles/H3K4me1"
		bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
		aln <- lapply(bamFiles, readGAlignments)
		counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
		names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

		df=data.frame(as.list(counts),stringsAsFactors=FALSE)
		maxlevel=apply(df, 1, max)
		rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)
		liblen=lengths(aln, use.names = TRUE)

		data=df
		peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
		row.names(data)=peakId

		getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
		getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
		getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])

Differential_analysis/README.md

deleted100644 → 0

+0 −9

Original line number	Diff line number	Diff line
		# Differential_analysis_NGSdata

		Differential analsyses of RNA-seq, ChIP-seq for H3K4me1 or H3k27ac, and ATAC-seq were performed by running edgeR for the comparisons between mutant and wildtype HSPCs.

		RNAseq_differential_analysis.R
		ATACseq_differential_analysis.R
		H3K4me1_differential_analysis.R
		H3K27ac_differential_analysis.R

Differential_analysis/RNAseq_differential_analysis.R

deleted100644 → 0

+0 −80

Original line number	Diff line number	Diff line
		runDESeq<-function(s1,s2){
		library(DESeq2)
		############################################################
		# List the files with read counts generated from HTSeq
		# This will include files for both conditions
		# Each file contains read counts for all the transcripts
		############################################################
		sampleFiles =c(paste0(s1,".EXPR.R1.htseq.counts"),paste0(s1,".EXPR.R2.htseq.counts"),paste0(s2,".EXPR.R1.htseq.counts"),paste0(s2,".EXPR.R2.htseq.counts"))
		print(sampleFiles)

		###########################################################
		# Get sample names from file Names
		############################################################
		sampleNames <- gsub(".htseq.counts","",sampleFiles)
		sampleNames <- gsub(".EXPR","",sampleNames)
		print(sampleNames)
		###########################################################
		# Define sample conditon for each sample in the order
		# as the sample names are listed
		############################################################
		sampleCondition=c( s1,s1,s2,s2)
		print(sampleCondition)
		###########################################################
		# Store all the information in a table
		############################################################
		sampleTable <- data.frame(sampleName = sampleNames,
		fileName = sampleFiles,
		condition = sampleCondition)
		print(sampleTable)
		#########################################################
		# Read the counts for each transcript for all samples
		#########################################################
		ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
		directory = "./HTSeq-Counts",
		design= ~ condition)
		print(ddsHTSeq)
		########################################################
		# Use only those transcripts which have a rowcount >1
		########################################################
		geneFile=read.delim("gencode.vM7.proteinCoding_genes.list", header=F,row.name=1)
		geneList=rownames(geneFile)
		ddsHTSeq <- ddsHTSeq[rownames(ddsHTSeq)%in%geneList, ]
		print(ddsHTSeq)
		genesNameFile=read.csv("./gencode.vM7.geneNames.MOD.txt",sep="\t",header=F, row.name =1)
		geneNames=genesNameFile[1]
		rownames(ddsHTSeq)=geneNames[rownames(ddsHTSeq),]
		########################################################
		# Use only those transcripts which have a rowcount >1
		########################################################
		dds <- ddsHTSeq[ rowSums(counts(ddsHTSeq)) > 1, ]
		print(dds)
		dds$condition <- relevel(dds$condition, ref=s1)
		print(dds$condition)
		#######################################################
		# Differential Expression analysis
		#######################################################
		ana<- DESeq(dds)
		print(ana)
		#######################################################
		# Retrieve results of Differential Expression analysis
		#######################################################
		res <- results(ana)

		#######################################################
		# Summary of results of Differential Expression analysis
		#######################################################
		summary(res)

		##########################################################
		# Write output to a file
		##########################################################
		resOrdered <- res[order(res$log2FoldChange),]

		write.table(as.data.frame(resOrdered),file=paste0(s1,".",s2,".PC.diffExp.csv"), sep="\t", quote=F, row.names=TRUE)

		do.call(runDESeq, list(s1="WT",s2="DM"))
		do.call(runDESeq, list(s1="WT",s2="NPM1"))
		do.call(runDESeq, list(s1="WT",s2="FLT3"))

Admin message