Unverified Commit 2e64d2f2 authored by hy298's avatar hy298 Committed by GitHub
Browse files

Delete Differential_analysis directory

parent 119e5083
Loading
Loading
Loading
Loading
+0 −46
Original line number Diff line number Diff line
### Differential analysis by edgeR

library(GenomicRanges)
library(GenomicAlignments)
library("edgeR")
library("gplots")

getFit <- function(data, idx, lv, libsize){
  group <- factor(c(1,1,2,2))
  design <- model.matrix(~group)
  data <- data[,idx, drop=FALSE]
  y <- DGEList(counts=data, group=group,lib.size=libsize)
  keep <- rowSums(cpm(y)>1) >= 2
  y <- y[keep, , keep.lib.sizes=FALSE]
  y <- calcNormFactors(y)
  y <- estimateDisp(y, design)
  y <- estimateGLMCommonDisp(y, design)
  y <- estimateGLMTrendedDisp(y, design)
  y <- estimateGLMTagwiseDisp(y, design)
  fit <- glmFit(y, design)
  lrt <- glmLRT(fit, coef=2)
  DEg <- as.data.frame(topTags(lrt, n=10^6))
  write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_ATAC.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
}

peakFile=read.delim("./ATAC_consensus_peaks_filter.bed", header = TRUE, sep ="\t")
peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
bamFilePath="/serenity/data/Haiyang/ATAC_HY/promoter/bamfiles/"
bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
aln <- lapply(bamFiles, readGAlignments)
counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

df=data.frame(as.list(counts),stringsAsFactors=FALSE)
maxlevel=apply(df, 1, max)
rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)

liblen=lengths(aln, use.names = TRUE)
data=df
peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
row.names(data)=peakId

getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])
+0 −45
Original line number Diff line number Diff line
### Differential analysis by edgeR

library(GenomicRanges)
library(GenomicAlignments)
library("edgeR")
library("gplots")

getFit <- function(data, idx, lv, libsize){
  group <- factor(c(1,1,2,2))
  design <- model.matrix(~group)
  data <- data[,idx, drop=FALSE]
  y <- DGEList(counts=data, group=group,lib.size=libsize)
  keep <- rowSums(cpm(y)>1) >= 2
  y <- y[keep, , keep.lib.sizes=FALSE]
  y <- calcNormFactors(y)
  y <- estimateDisp(y, design)
  y <- estimateGLMCommonDisp(y, design)
  y <- estimateGLMTrendedDisp(y, design)
  y <- estimateGLMTagwiseDisp(y, design)
  fit <- glmFit(y, design)
  lrt <- glmLRT(fit, coef=2)
  DEg <- as.data.frame(topTags(lrt, n=10^6))
  write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_H3K27ac.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
}

peakFile=read.delim("./H3K27ac_consensus_peaks.bed", header = TRUE, sep ="\t")
peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
bamFilePath="/serenity/data/Haiyang/ChIP_HY/bamfiles/H3K27ac"
bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
aln <- lapply(bamFiles, readGAlignments)
counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

df=data.frame(as.list(counts),stringsAsFactors=FALSE)
maxlevel=apply(df, 1, max)
rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)

liblen=lengths(aln, use.names = TRUE)
data=df
peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
row.names(data)=peakId

getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])
 No newline at end of file
+0 −47
Original line number Diff line number Diff line
### Differential analysis by edgeR

library(GenomicRanges)
library(GenomicAlignments)
library("edgeR")
library("gplots")

getFit <- function(data, idx, lv, libsize){
  group <- factor(c(1,1,2,2))
  design <- model.matrix(~group)
  data <- data[,idx, drop=FALSE]
  y <- DGEList(counts=data, group=group,lib.size=libsize)
  keep <- rowSums(cpm(y)>1) >= 2
  y <- y[keep, , keep.lib.sizes=FALSE]
  y <- calcNormFactors(y)
  y <- estimateDisp(y, design)
  y <- estimateGLMCommonDisp(y, design)
  y <- estimateGLMTrendedDisp(y, design)
  y <- estimateGLMTagwiseDisp(y, design)
  fit <- glmFit(y, design)
  lrt <- glmLRT(fit, coef=2)
  DEg <- as.data.frame(topTags(lrt, n=10^6))
  write.table(DEg,file=paste0(lv[1],".",lv[2],".diff_H3K4me1.edgeR.csv"), sep="\t", quote=F, row.names=TRUE)
}

peakFile=read.delim("./enhancerPeaks.merged.re.bed", header = TRUE, sep ="\t")
peakSet=GRanges(seqnames=peakFile$chr, IRanges(peakFile$start,peakFile$end))
bamFilePath="/serenity/data/Haiyang/ChIP_HY/bamfiles/H3K4me1"
bamFiles = dir(bamFilePath, pattern="*.bam$", full.name=T)
aln <- lapply(bamFiles, readGAlignments)
counts <- lapply(aln, GenomicRanges::countOverlaps, query=peakSet)
names(counts) <- gsub(".nodup.bam","",basename(bamFiles))

df=data.frame(as.list(counts),stringsAsFactors=FALSE)
maxlevel=apply(df, 1, max)
rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),df,Width=width(peakSet), Max=maxlevel)
liblen=lengths(aln, use.names = TRUE)

data=df
peakId <- with(peakFile, paste(chr,"_",start,"_",end,sep=""))
row.names(data)=peakId

getFit(data, idx=c(5,6,7,8), lv=c("WT", "NPM1"), libsize=liblen[c(5,6,7,8)])
getFit(data, idx=c(5,6,3,4), lv=c("WT", "FLT3"), libsize=liblen[c(5,6,3,4)])
getFit(data, idx=c(5,6,1,2), lv=c("WT", "DM"), libsize=liblen[c(5,6,1,2)])

Differential_analysis/README.md

deleted100644 → 0
+0 −9
Original line number Diff line number Diff line
# Differential_analysis_NGSdata

Differential analsyses of RNA-seq, ChIP-seq for H3K4me1 or H3k27ac, and ATAC-seq were performed by running edgeR for the comparisons between mutant and wildtype HSPCs.

RNAseq_differential_analysis.R
ATACseq_differential_analysis.R
H3K4me1_differential_analysis.R
H3K27ac_differential_analysis.R
+0 −80
Original line number Diff line number Diff line
runDESeq<-function(s1,s2){
library(DESeq2)
############################################################
# List the files with read counts generated from HTSeq 
# This will include files for both conditions
# Each file contains read counts for all the transcripts
############################################################
sampleFiles =c(paste0(s1,".EXPR.R1.htseq.counts"),paste0(s1,".EXPR.R2.htseq.counts"),paste0(s2,".EXPR.R1.htseq.counts"),paste0(s2,".EXPR.R2.htseq.counts"))
print(sampleFiles)

###########################################################
# Get sample names from file Names
############################################################
sampleNames <- gsub(".htseq.counts","",sampleFiles)
sampleNames <- gsub(".EXPR","",sampleNames)
print(sampleNames)
###########################################################
# Define sample conditon for each sample in the order
# as the sample names are listed
############################################################
sampleCondition=c( s1,s1,s2,s2)
print(sampleCondition)
###########################################################
# Store all the information in a table
############################################################
sampleTable <- data.frame(sampleName = sampleNames,
                          fileName = sampleFiles,
                          condition = sampleCondition)
print(sampleTable)
#########################################################
# Read the counts for each transcript for all samples
#########################################################
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                       directory = "./HTSeq-Counts",
                                       design= ~ condition)
print(ddsHTSeq)
########################################################
# Use only those transcripts which have a rowcount >1 
########################################################
geneFile=read.delim("gencode.vM7.proteinCoding_genes.list", header=F,row.name=1)
geneList=rownames(geneFile)
ddsHTSeq <- ddsHTSeq[rownames(ddsHTSeq)%in%geneList, ]
print(ddsHTSeq)
genesNameFile=read.csv("./gencode.vM7.geneNames.MOD.txt",sep="\t",header=F, row.name =1)
geneNames=genesNameFile[1]
rownames(ddsHTSeq)=geneNames[rownames(ddsHTSeq),]
########################################################
# Use only those transcripts which have a rowcount >1 
########################################################
dds <- ddsHTSeq[ rowSums(counts(ddsHTSeq)) > 1, ]
print(dds)
dds$condition <- relevel(dds$condition, ref=s1)
print(dds$condition)
#######################################################
# Differential Expression analysis
#######################################################
ana<- DESeq(dds)
print(ana)
#######################################################
# Retrieve results of Differential Expression analysis
#######################################################
res <- results(ana)

#######################################################
# Summary of results of Differential Expression analysis
#######################################################
summary(res)

##########################################################
# Write output to a file
##########################################################
resOrdered <- res[order(res$log2FoldChange),]

write.table(as.data.frame(resOrdered),file=paste0(s1,".",s2,".PC.diffExp.csv"), sep="\t", quote=F, row.names=TRUE)

do.call(runDESeq, list(s1="WT",s2="DM"))
do.call(runDESeq, list(s1="WT",s2="NPM1"))
do.call(runDESeq, list(s1="WT",s2="FLT3"))