Commit 61b52eba authored by HYsxe's avatar HYsxe
Browse files

Remove BuenRTools dependency

parent c5a53f11
Loading
Loading
Loading
Loading
+0 −5
Original line number Diff line number Diff line
#!/bin/bash
for i in {1..16}
do
   sbatch runTFBS.sh $i
done
+0 −309
Original line number Diff line number Diff line
# If running in Rstudio, set the working directory to current path
if (Sys.getenv("RSTUDIO") == "1"){
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

source("../../code/utils.R")
source("../../code/getTFBS.R")
library(ComplexHeatmap)
library(BuenColors)
library(circlize)
library(RColorBrewer)
library(ggrepel)

###################
# Load input data #
###################

# Load footprinting project
project <- readRDS("../../data/mHSCAging10xMultiome/project.rds")

regions <- regionRanges(project)

# Load LSI embedding of single cells
cellEmbedding <- readRDS("../../data/mHSCAging10xMultiome/LSIEmbedding.rds")

# Load pseudobulk centers
pseudobulkCenters <- read.table("../../data/mHSCAging10xMultiome/pseudobulkCenters.txt",
                                header = T)
rownames(pseudobulkCenters) <- pseudobulkCenters$group

# Get substructure-by-pseudobulk SummarizedExperiment object
substructurePath <- "../../data/mHSCAging10xMultiome/substructureSE.rds"
if(file.exists(substructurePath)){
  substructureSE <- readRDS("../../data/mHSCAging10xMultiome/substructureSE.rds")
}else{
  substructureSE <- getSubstructureSE(project)
  saveRDS(substructureSE, substructurePath)
}
substructureRanges <- rowRanges(substructureSE)
substructureMat <- assay(substructureSE)

# Load results from differential RNA testing
diffRNA <- read.table("../../data/mHSCAging10xMultiome/diffRNA.tsv")

# Load differential TFBS and CRE results
diffCompare <- read.table("../../data/mHSCAging10xMultiome/diff_CRE_substructure_compare.tsv",
                          sep = "\t", header = T)

# Load single cell ATAC data
scATACSeurat <- readRDS("../../data/mHSCAging10xMultiome/scATACSeurat.rds")

# Load TF motifs
cisBPMotifs <- readRDS("../..//data/shared/cisBP_mouse_pwms_2021.rds")

#########################
# Re-number pseudobulks #
#########################

pseudobulkNames <- groups(project)
pseudobulkCenters <- pseudobulkCenters[pseudobulkNames,]
pseudobulkAges <- sapply(pseudobulkCenters$barcode, function(x){strsplit(x, "-")[[1]][2]})
pseudobulkLSI <- cellEmbedding[pseudobulkCenters$barcode, 2]

# Within each age-group, re-order pseudobulks by LSI-2
reOrder <- c(order(pseudobulkLSI[pseudobulkAges == "Old"]), 
             order(pseudobulkLSI[pseudobulkAges == "Young"]) + sum(pseudobulkAges == "Old"))
substructureMat <- substructureMat[, reOrder]
groupATAC(project) <- groupATAC(project)[, reOrder]
groupRNA(project) <- groupRNA(project)[, reOrder]

# Re-label pseudobulks
colnames(substructureMat) <- pseudobulkNames
colnames(groupATAC(project)) <- pseudobulkNames
colnames(groupRNA(project)) <- pseudobulkNames

#######################################
# Old/Young-like old/Young assignment #
#######################################

seuratClusters <- as.character(scATACSeurat$seurat_clusters)
names(seuratClusters) <- colnames(scATACSeurat)
ageCluster <- seuratClusters[pseudobulkCenters$barcode]
names(ageCluster) <- pseudobulkCenters$group
ageCluster <- ageCluster[pseudobulkNames]
ageCluster <- ageCluster[reOrder]
names(ageCluster) <- pseudobulkNames
ageCluster[ageCluster == "0"] <- "Old"
ageCluster[ageCluster == "6"] <- "Young-like old"
ageCluster[ageCluster == "10"] <- "Young"

################################
# Motif scoring of pseudobulks #
################################

# Filter out sites with low signal
substructureFilter <- rowMaxs(substructureMat) > 0.3
substructureRanges <- substructureRanges[substructureFilter]
substructureMat <- substructureMat[substructureFilter,]

# Create SummarizedExperiment object for differential testing
mode <- "substructure"
if(mode == "substructure"){
  diffInds <- (abs(diffCompare$substructureDiff) > 1) & (abs(diffCompare$CREDiff) < 1)
  diffInds <- diffCompare$substructureInd[diffInds]
  diffSE <- SummarizedExperiment(list(counts = substructureMat[diffInds, ]),
                                 rowRanges = substructureRanges[diffInds])
}else if(mode == "CRE"){
  diffInds <- (abs(diffCompare$substructureDiff) < 1) & (abs(diffCompare$CREDiff) > 1)
  diffInds <- diffCompare$CREInd[diffInds]
  diffSE <- SummarizedExperiment(list(counts = groupATAC(project)[diffInds, ]),
                                 rowRanges = regions[diffInds])
}

# Filter features with zero signal
diffSE <- diffSE[rowSums(assay(diffSE)) > 0, ]

# Add GC bias
diffSE <- chromVAR::addGCBias(diffSE, genome=BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10)

# Remove NA values
bias <- rowRanges(diffSE)$bias
rowRanges(diffSE)$bias[is.na(rowRanges(diffSE)$bias)] <- mean(bias[!is.na(bias)])

# Get background substructures or CREs with matched GC bias and average signal
bgSites <- chromVAR::getBackgroundPeaks(diffSE,niterations = 100) 

# Get pseudobulk-by-TF motif score matrix
motifScores <- pbmcapply::pbmcmapply(
  function(TF){
    
    # Find motif matches
    motifMatches <- motifmatchr::matchMotifs(cisBPMotifs[TF], 
                                             diffSE, 
                                             genome = "mm10")
    
    # Compute motif scores (deviation score from background)
    deviation <- chromVAR::computeDeviations(object = diffSE, 
                                             annotations = motifMatches, 
                                             background_peaks = bgSites)
    deviation <- assay(deviation)
  },
  names(cisBPMotifs),
  mc.cores = 8
)
rownames(motifScores) <- colnames(diffSE)
saveRDS(motifScores, paste0("../../data/mHSCAging10xMultiome/", mode, "MotifScores.rds"))

# Differential testing comparing motifs scores of groupA and groupB
groupsA <- which(stringr::str_detect(rownames(motifScores), "Young"))
groupsB <- which(stringr::str_detect(rownames(motifScores), "Old"))
diffMotifScore <- pbmcapply::pbmclapply(
  names(cisBPMotifs),
  function(TF){
    test <- t.test(motifScores[groupsA, TF], motifScores[groupsB, TF])
    data.frame(pval = test$p.value, diffMean = mean(motifScores[groupsB, TF]) - mean(motifScores[groupsA, TF]))
  }
)
diffMotifScore <- as.data.frame(data.table::rbindlist(diffMotifScore))
rownames(diffMotifScore) <- names(cisBPMotifs)
diffMotifScore$TF <- names(cisBPMotifs)
diffMotifScore$logP <- -log10(diffMotifScore$pval)
diffMotifScore$fdr <- p.adjust(diffMotifScore$pval, method = "fdr")
diffMotifScore$signedLogFDR <- -log10(diffMotifScore$fdr) * sign(diffMotifScore$diffMean)
diffMotifScore <- diffMotifScore[order(diffMotifScore$signedLogFDR, decreasing = T),]
saveRDS(diffMotifScore, paste0("../../data/mHSCAging10xMultiome/",
                               mode, "DiffMotifScores.rds"))

######################################################
# Comapre diff motif scores with diff TF RNA results #
######################################################

ovTFs <- intersect(rownames(diffMotifScore), rownames(diffRNA))
diffMotifScore <- diffMotifScore[ovTFs, ]
diffTFRNA <- diffRNA[ovTFs, ]
diffMotifScore$FDR <- p.adjust(diffMotifScore$pval, method = "fdr")
diffTFRNA$FDR <- p.adjust(diffTFRNA$pvalue, method = "fdr")
diffMotifScore$signedLogFDR <- -log10(diffMotifScore$FDR) * sign(diffMotifScore$diffMean)
diffTFRNA$signedLogFDR <- -log10(diffTFRNA$FDR) * sign(diffTFRNA$log2FoldChange)

plotData <- data.frame(
  diffMotifScore = diffMotifScore$signedLogFDR,
  diffTFRNA = diffTFRNA$signedLogFDR,
  TF = ovTFs
)
selected <- ((plotData$diffMotifScore > 1) & (plotData$diffTFRNA > 10)) |
  ((plotData$diffMotifScore < -1) & (plotData$diffTFRNA < -10))
rownames(plotData) <- ovTFs
labels <- rep("", dim(plotData)[1])
labels[selected] <- plotData$TF[selected]
pdf(paste0("../../data/mHSCAging10xMultiome/plots/Old_HSC_vs_Young_HSC_diff_", 
           mode, "_motif_scores_RNA.pdf"),
    width = 7, height = 6)
ggplot(plotData) +
  geom_point(aes(x = diffMotifScore, y = diffTFRNA), color = "grey") +
  geom_point(data = plotData[selected,],
             aes(x = diffMotifScore, y = diffTFRNA), color = "red") +
  geom_text_repel(x = plotData$diffMotifScore, y = plotData$diffTFRNA, 
                  label = labels, max.overlaps = 1e5,
                  force = 1) +
  xlab("Differential motif score (Old - Young)\nsigned -log10(FDR)") +
  ylab("Differential RNA (Old - Young)\nsigned -log10(FDR)") +
  theme_classic()
dev.off()

######################################################
# Comapre diff motif scores with diff TF RNA results #
######################################################

diffCREMotifScores <- readRDS("../../data/mHSCAging10xMultiome/CREDiffMotifScores.rds")
diffSubstructureMotifScores <- readRDS("../../data/mHSCAging10xMultiome/substructureDiffMotifScores.rds")
ovTFs <- intersect(rownames(diffCREMotifScores), intersect(rownames(diffCREMotifScores), rownames(diffRNA)))
diffTFRNA <- diffRNA[ovTFs, ]
ovTFs <- ovTFs[diffTFRNA$padj < 0.01]
diffCREMotifScores <- diffCREMotifScores[ovTFs, ]
diffSubstructureMotifScores <- diffSubstructureMotifScores[ovTFs, ]

diffRNASigndLogFDR <- sign(diffTFRNA[ovTFs, ]$log2FoldChange) * -log10(diffTFRNA[ovTFs, ]$padj)
diffRNASigndLogFDR <- pmin(pmax(diffRNASigndLogFDR, -5), 5)
plotData <- data.frame(diffSubstructure = diffSubstructureMotifScores$signedLogFDR,
                       diffCRE = diffCREMotifScores$signedLogFDR,
                       TF = ovTFs,
                       diffRNA = diffRNASigndLogFDR)
rownames(plotData) <- ovTFs

# Label substructure-specific TFs
selected <- ((abs(diffSubstructureMotifScores$signedLogFDR) > 2) & (abs(diffCREMotifScores$signedLogFDR) < 1)) 
labelsA <- rep("", dim(plotData)[1])
labelsA[selected] <- plotData$TF[selected]

# Label shared TFs
selected <- ((abs(diffSubstructureMotifScores$signedLogFDR) > 2) & 
             (abs(diffCREMotifScores$signedLogFDR) > 2) &
               (diffCREMotifScores$signedLogFDR * diffSubstructureMotifScores$signedLogFDR > 0)) 
rownames(plotData) <- ovTFs
labelsB <- rep("", dim(plotData)[1])
labelsB[selected] <- plotData$TF[selected]

pdf("../../data/mHSCAging10xMultiome/plots/CRE_subtructure_diff_motif_score_compare.pdf",
    width = 7, height = 6)
ggplot(plotData) +
  geom_point(aes(x = diffCRE, y = diffSubstructure, color = diffRNA)) +
  xlim(-10,10) + ylim(-10,10) + 
  scale_color_gradientn(colors = jdb_palette("solar_extra")) +
  xlab("Diff CRE motif scores signed log10(FDR)") +
  ylab("Diff substructure motif scores signed log10(FDR)") +
  geom_text_repel(x = plotData$diffCRE, y = plotData$diffSubstructure, 
                  label = labelsA, max.overlaps = 1e5,
                  force = 1, color = "Red") +
  geom_text_repel(x = plotData$diffCRE, y = plotData$diffSubstructure, 
                  label = labelsB, max.overlaps = 1e5,
                  force = 1, color = "Black") +
  geom_vline(xintercept = 1, linetype ="dashed", 
             color = "black", size = 0.5) +
  geom_vline(xintercept = -1, linetype ="dashed", 
             color = "black", size = 0.5) +
  geom_hline(yintercept = 1, linetype ="dashed", 
             color = "black", size = 0.5) +
  geom_hline(yintercept = -1, linetype ="dashed", 
             color = "black", size = 0.5) +
  theme_classic()
dev.off()

###########################################
# Plot heatmap of TF motif scores and RNA #
###########################################

plotTFs <- "Specific"

if(plotTFs == "Shared"){
  # Select shared TFs
  selected <- ((abs(diffSubstructureMotifScores$signedLogFDR) > 2) & 
                 (abs(diffCREMotifScores$signedLogFDR) > 2) &
                 (diffCREMotifScores$signedLogFDR * diffSubstructureMotifScores$signedLogFDR > 0) &
                 (diffTFRNA[ovTFs, ]$log2FoldChange * diffSubstructureMotifScores$signedLogFDR > 0)) 
  
}else if(plotTFs == "Specific"){
  # Select substructure-specific TFs
  selected <- ((abs(diffSubstructureMotifScores$signedLogFDR) > 2) & (abs(diffCREMotifScores$signedLogFDR) < 1)) 
  
}

# Visualize motif scores per pseudobulk
pltMatrix <- motifScores[,plotData$TF[selected]]
pltMatrix <- t(pltMatrix)
pltMatrix <- t((pltMatrix - rowMeans(pltMatrix)) / rowSds(pltMatrix))
TFOrder <- rev(hclust(distCosine(t(motifScores[,plotData$TF[selected]])))$order)
pdf(paste0("../../data/mHSCAging10xMultiome/plots/", mode, plotTFs, "PseudobulkMotifScores.pdf"),
    width = 7, height = 6)
Heatmap(pltMatrix[, TFOrder],
        cluster_rows = F,
        cluster_columns = F,
        name = "Motif\nscore",
        row_split = factor(ageCluster, levels = c("Old", "Young-like old", "Young")),
        col = jdb_palette(n = 9, "solar_extra")[1:9])
dev.off()

# Visualize RNA of the corresponding TFs per pseudobulk
pltMatrix <- t(groupRNA(project)[plotData$TF[selected], ])
pltMatrix <- t(pltMatrix)
pltMatrix <- t((pltMatrix - rowMeans(pltMatrix)) / rowSds(pltMatrix))
pdf("../../data/mHSCAging10xMultiome/plots/pseudobulkTFRNA.pdf",
    width = 7, height = 6)
Heatmap(pltMatrix[, TFOrder],
        cluster_rows = F,
        cluster_columns = F,
        row_split = factor(ageCluster, levels = c("Old", "Young-like old", "Young")),
        name = "RNA",
        col = jdb_palette(n = 9, "solar_extra")[1:9])
dev.off()
+0 −81
Original line number Diff line number Diff line
# If running in Rstudio, set the working directory to current path
if (Sys.getenv("RSTUDIO") == "1"){
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

source("../../code/utils.R")
source("../../code/getGroupData.R")

###################
# Load input data #
###################

# Load footprinting project
project <- readRDS("../../data/mHSCAging10xMultiome/project.rds")

# Load LSI embedding of single cells
cellEmbedding <- readRDS("../../data/mHSCAging10xMultiome/LSIEmbedding.rds")

# Load barcodes for each pseudobulk
barcodeGrouping <- read.table("../../data/mHSCAging10xMultiome/barcodeGrouping.txt",
                              header = T)

# Load pseudobulk centers
pseudobulkCenters <- read.table("../../data/mHSCAging10xMultiome/pseudobulkCenters.txt",
                                header = T)
rownames(pseudobulkCenters) <- pseudobulkCenters$group

# Load single cell RNA data
scRNA <- readRDS("../../data/mHSCAging10xMultiome/scRNA.rds")

# Load barcodes for each pseudo-bulk
barcodeGroups <- read.table("../../data/mHSCAging10xMultiome/barcodeGrouping.txt", header = T)
barcodeGrouping(project) <- barcodeGroups
groups(project) <- gtools::mixedsort(unique(barcodeGroups$group))

#########################
# Re-number pseudobulks #
#########################

pseudobulkNames <- groups(project)
pseudobulkCenters <- pseudobulkCenters[pseudobulkNames,]
pseudobulkAges <- sapply(pseudobulkCenters$barcode, function(x){strsplit(x, "-")[[1]][2]})
pseudobulkLSI <- cellEmbedding[pseudobulkCenters$barcode, 2]

# Within each age-group, re-order pseudobulks by their LSI-2 coordinate
# We don't use the first LSI because it highly correlates with depth
reOrder <- c(order(pseudobulkLSI[pseudobulkAges == "Old"]), 
             order(pseudobulkLSI[pseudobulkAges == "Young"]) + sum(pseudobulkAges == "Old"))

#############################
# Differential RNA analysis #
#############################

RNAMatrix <- scRNA@assays$RNA@data
pseudobulkRNA <- getGroupRNA(RNAMatrix, barcodeGroups)
pseudobulkRNA <- pseudobulkRNA[, reOrder]
colnames(pseudobulkRNA) <- pseudobulkNames

# Get pseudobulk metadata
metadata <- t(sapply(pseudobulkNames, 
                     function(x){
                       strsplit(x, "_")[[1]][c(1,2)]
                     }))
metadata <- as.data.frame(metadata)
colnames(metadata) <- c("Age", "pbulk_ind")

# Generate DDS object
dds <- DESeq2::DESeqDataSetFromMatrix(
  countData = pseudobulkRNA,
  colData = metadata,
  design = ~ Age)

# Model fitting
dds <- DESeq2::DESeq(dds)

# Retrieve differential analysis results
diffRNA <- DESeq2::results(dds, contrast = c("Age","Old","Young"))
diffRNA <- diffRNA[!is.na(diffRNA$padj),]
diffRNA <- diffRNA[order(diffRNA$pvalue),]
write.table(diffRNA, "../../data/mHSCAging10xMultiome/diffRNA.tsv",
            sep = "\t", quote = F)
+0 −267

File deleted.

Preview size limit exceeded, changes collapsed.

+0 −144
Original line number Diff line number Diff line
# If running in Rstudio, set the working directory to current path
if (Sys.getenv("RSTUDIO") == "1"){
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

source("../../code/utils.R")
source("../../code/getCounts.R")
source("../../code/getBias.R")
source("../../code/getGroupData.R")
source("../../code/getFootprints.R")
source("../../code/visualization.R")
source("../../code/getTFBS.R")

# Initialize a footprintingProject object
projectName <- "mHSCAging10xMultiome/"
project <- footprintingProject(projectName = projectName, 
                               refGenome = "mm10")
projectMainDir <- "../../"
projectDataDir <- paste0(projectMainDir, "data/", projectName)
dataDir(project) <- projectDataDir
mainDir(project) <- projectMainDir

# Load region ranges
regions <- readRDS(paste0(projectMainDir, "data/mHSCAging10xMultiome/regionRanges.rds"))

# Set the regionRanges slot
regionRanges(project) <- regions

# Use our neural network to predict Tn5 bias for all positions in all regions
# Remember to make a copy of the Tn5_NN_model.h5 file in projectDataDir!!
biasPath <- paste0(projectMainDir, "data/mHSCAging10xMultiome/predBias.rds")
if(file.exists(biasPath)){
  regionBias(project) <- readRDS(biasPath)
}else{
  project <- getRegionBias(project, nCores = 24)
  saveRDS(regionBias(project), biasPath)
}

# Load barcodes for each pseudo-bulk
pathToFragGrouping <- paste0(projectDataDir, "barcodeGrouping.txt")
barcodeGroups <- read.table(pathToFragGrouping, header = T)
barcodeGrouping(project) <- barcodeGroups
groups(project) <- mixedsort(unique(barcodeGroups$group))

# Getting the pseudobulk-by-region-by-position counts tensor from a fragment file
pathToFrags <- paste0(projectMainDir, "data/mHSCAging10xMultiome/all.frags.filt.tsv.gz")
getCountTensor(project, pathToFrags, barcodeGroups, returnCombined = F)

# Get gene-by-pseudobulk RNA matrix, followed by LogNormalization
pseudobulkRNAPath <- "../../data/mHSCAging10xMultiome/pseudobulkRNA.rds"
if(!file.exists(pseudobulkRNAPath)){
  scRNA <- readRDS("../../data/mHSCAging10xMultiome/scRNA.rds")
  RNAMatrix <- scRNA@assays$RNA@data
  pseudobulkRNA <- getGroupRNA(RNAMatrix, barcodeGroups)
  pseudobulkRNA <- preprocessCore::normalize.quantiles(pseudobulkRNA)
  rownames(pseudobulkRNA) <- rownames(RNAMatrix)
  groupRNA(project) <- pseudobulkRNA
  saveRDS(pseudobulkRNA, pseudobulkRNAPath)
}else{
  groupRNA(project) <- readRDS(pseudobulkRNAPath)
}

# Get region-by-pseudobulk ATAC matrix 
pseudobulkATACPath <- paste0(projectDataDir, "pseudobulkATAC.rds")
if(!file.exists(pseudobulkATACPath)){
  groupATAC(project) <- getGroupATAC(project)
  groupATAC(project) <- centerCounts(groupATAC(project))
  saveRDS(groupATAC(project), pseudobulkATACPath)
}else{
  groupATAC(project) <- readRDS(pseudobulkATACPath)
}

# Load Tn5 background dispersion model
for(kernelSize in 2:100){
  dispModel(project, as.character(kernelSize)) <- 
    readRDS(paste0("../../data/shared/dispModel/dispersionModel", kernelSize, "bp.rds"))
}

# Load TFBS prediction model
h5Path <- "../../data/TFBSPrediction/TFBS_model.h5"
TFBSModel <- keras::load_model_hdf5(h5Path)
TFBSModel <- keras::get_weights(TFBSModel)
h5file <- H5File$new(h5Path, mode="r")
footprintMean <- h5file[["footprint_mean"]]
footprintMean <- footprintMean[1:footprintMean$dims]
footprintSd <- h5file[["footprint_sd"]]
footprintSd <- footprintSd[1:footprintSd$dims]
h5file$close_all()
TFBSModel$footprintMean <- footprintMean
TFBSModel$footprintSd <- footprintSd
TFBSModel$scales <- c(10,20,30,50,80,100)
TFBindingModel(project) <- TFBSModel

# Save the project object with pre-loaded slots
saveRDS(project, "../../data/mHSCAging10xMultiome/project.rds")

# Specify which programs to run
args <- commandArgs(trailingOnly=TRUE)
options <- rep(F, 2)
options[as.integer(args[1])] <- T
runTFBS <- options[1]
runFootprinting <- options[2]

# Run footprinting
if(runFootprinting){
  
  # Set footprint radius
  if(is.na(args[2])){
    footprintRadius <- 20
  }else{
    footprintRadius <- as.integer(args[2])
  }
  
  project <- getFootprints(
    project, 
    mode = as.character(footprintRadius),
    nCores = 16, 
    footprintRadius = footprintRadius,
    flankRadius = footprintRadius)
  
  system(paste0("mkdir ../../data/", projectName, "/footprints/"))
  saveRDS(footprints(project, as.character(footprintRadius)), 
          paste0(projectDataDir, "footprints/", footprintRadius, "bp_footprints.rds"))
}

# Run TFBS detection
if(runTFBS){
  
  # Get argument from command line
  if(is.na(args[2])){
    nChunks <- length(getChunkInterval(regions, regionChunkSize(project))$ends)
    chunkInds <- 1:nChunks
  }else{
    chunkInds <- 10 * (as.integer(args[2]) - 1) + 1:10
  }
  print(paste("Processing chunks", chunkInds[1], "to", chunkInds[length(chunkInds)]))
  
  project <- getTFBS(project,
                     chunkInds = chunkInds,
                     nCores = 12)
  
}

Loading