Commit a5ee9b11 authored by Chaos's avatar Chaos
Browse files

code clean

parent beddcca7
Loading
Loading
Loading
Loading
+535 −487
Original line number Diff line number Diff line
# scATAC data analysis pipeline, mainly with ArchR, v1.0.1
library(ArchR) # version 
# scATAC data analysis pipeline, mainly with ArchR
# you need to install Seural package 
pacman::p_load(ArchR,parallel)
set.seed(1)
addArchRThreads(threads = 16) 
addArchRGenome("hg38")

# input data from 10x cellranger-atac output
inputFiles <- c(
"scA/fragments.tsv.gz","scB/fragments.tsv.gz","scC/fragments.tsv.gz","scD/fragments.tsv.gz","scE/fragments.tsv.gz","scF/fragments.tsv.gz","scG/fragments.tsv.gz","scH/fragments.tsv.gz","scI/fragments.tsv.gz","scJ/fragments.tsv.gz","scK/fragments.tsv.gz","scL/fragments.tsv.gz","scM/fragments.tsv.gz","scN/fragments.tsv.gz","scO/fragments.tsv.gz","scP/fragments.tsv.gz","scQ/fragments.tsv.gz","scR/fragments.tsv.gz","scS/fragments.tsv.gz","scT/fragments.tsv.gz","scU/fragments.tsv.gz","scV/fragments.tsv.gz","scW/fragments.tsv.gz","scX/fragments.tsv.gz","scY/fragments.tsv.gz","scZ/fragments.tsv.gz","scAA/fragments.tsv.gz","scAB/fragments.tsv.gz","scAC/fragments.tsv.gz","scAD/fragments.tsv.gz","scAE/fragments.tsv.gz","scAF/fragments.tsv.gz","scAG/fragments.tsv.gz","scAH/fragments.tsv.gz","scAI/fragments.tsv.gz","scAJ/fragments.tsv.gz","scAK/fragments.tsv.gz","scAL/fragments.tsv.gz","scAM/fragments.tsv.gz","scAN/fragments.tsv.gz","scAO/fragments.tsv.gz","scAP/fragments.tsv.gz","scAQ/fragments.tsv.gz","scAR/fragments.tsv.gz")
names(inputFiles)<-c("scA","scB","scC","scD","scE","scF","scG","scH","scI","scJ","scK","scL","scM","scN","scO","scP","scQ","scR","scS","scT","scU","scV","scW","scX","scY","scZ","scAA","scAB","scAC","scAD","scAE","scAF","scAG","scAH","scAI","scAJ","scAK","scAL","scAM","scAN","scAO","scAP","scAQ","scAR")
result_dir <- "scATAC_analysis_result"
if(!dir.exists(result_dir)){dir.create(result_dir)}

id <- paste0("sc",c("AA","AB","AC","AH","AI"))
fragment_file <- paste0(id,"/outs/fragments.tsv.gz")

# create ArchR object
ArrowFiles <-	createArrowFiles(
  inputFiles = inputFiles,
  sampleNames = names(inputFiles),
  filterTSS = 4, #Dont set this too high because you can always increase later
  filterFrags = 1000, 
					inputFiles = fragment_file,
					sampleNames = id,
					minTSS = 4, #Dont set this too high because you can always increase later
					minFrags = 1000, 
					addTileMat = TRUE,
					force = FALSE,
					addGeneScoreMat = TRUE
				)
projCAD1 <- ArchRProject(

proj_CAD <-	ArchRProject(
				ArrowFiles = ArrowFiles, 
				outputDirectory = "CAD",
				copyArrows = TRUE #This is recommened so that if you modify the Arrow files you have an original copy for later usage.
@@ -33,8 +36,10 @@ doubScores <- addDoubletScores(
					force=TRUE
				)

proj_CAD_1 <- proj_CAD

# basic QC 
proj_CAD_1 <- projCAD1
df <- getCellColData(proj_CAD_1, select = c("log10(nFrags)", "TSSEnrichment"))
p <-	ggPoint(
			x = df[,1], 
			y = df[,2], 
@@ -44,8 +49,11 @@ p <- ggPoint(
			ylabel = "TSS Enrichment",
			xlim = c(log10(500), quantile(df[,1], probs = 0.99)),
			ylim = c(0, quantile(df[,2], probs = 0.99))
) + geom_hline(yintercept = 4, lty = "dashed") + geom_vline(xintercept = 3, lty = "dashed")
plotPDF(p, name = "TSS-vs-Frags.pdf", ArchRProj = proj_CAD_1, addDOC = FALSE)
		) +
		geom_hline(yintercept = 4, lty = "dashed") +
		geom_vline(xintercept = 3, lty = "dashed")

plotPDF(p,name = "1.TSS-vs-Frags.pdf", ArchRProj = proj_CAD_1, addDOC = FALSE)

p1 <-	plotGroups(
			ArchRProj = proj_CAD_1, 
@@ -79,11 +87,18 @@ p4 <- plotGroups(
			alpha = 0.4,
			addBoxPlot = TRUE
		)
plotPDF(p1,p2,p3,p4, name = "QC-Sample-Statistics.pdf", ArchRProj = proj_CAD_1, addDOC = FALSE, width = 4, height = 4)
plotPDF(p1,p2,p3,p4, name = "2.QC-Sample-Statistics.pdf", ArchRProj = proj_CAD_1, addDOC = FALSE, width = 4, height = 4)

p1 <- plotFragmentSizes(ArchRProj = proj_CAD_1)
p2 <- plotTSSEnrichment(ArchRProj = proj_CAD_1)
plotPDF(p1,p2, name = "QC-Sample-FragSizes-TSSProfile.pdf", ArchRProj = proj_CAD_1, addDOC = FALSE, width = 5, height = 5)
plotPDF(p1,p2, name = "3.QC-Sample-FragSizes-TSSProfile.pdf", ArchRProj = proj_CAD_1, addDOC = FALSE, width = 5, height = 5)

# filter cells
proj_CAD_2 <- filterDoublets(proj_CAD,filterRatio=1.5)
df2 <- getCellColData(proj_CAD_2,select = c("log10(nFrags)", "TSSEnrichment"))
idxPass <- which(proj_CAD_2$TSSEnrichment >= 7 & proj_CAD_2$nFrags >= 10000)
cellsPass <- proj_CAD_2$cellNames[idxPass]
proj_CAD_2 <- proj_CAD_2[cellsPass, ]

p <-	ggPoint(
			x = df2[,"log10(nFrags)"], 
@@ -94,24 +109,19 @@ p <- ggPoint(
			ylabel = "TSS Enrichment",
			xlim = c(3, quantile(df2[,"log10(nFrags)"], probs = 0.99)),
			ylim = c(4, quantile(df2[,"TSSEnrichment"], probs = 0.99))
) + geom_hline(yintercept = 7, lty = "dashed") + geom_vline(xintercept = 4, lty = "dashed")
plotPDF(p, name = "TSS-vs-Frags_cutoff.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE)


# filter cells
idxPass <- which(proj_CAD_2$TSSEnrichment >= 7 & proj_CAD_2$nFrags >= 10000)
proj_CAD_2 <- filterDoublets(proj_CAD_1,filterRatio=1.5)
df2 <- getCellColData(proj_CAD_2,select = c("log10(nFrags)", "TSSEnrichment"))
cellsPass <- proj_CAD_2$cellNames[idxPass]
proj_CAD_2 <- proj_CAD_2[cellsPass, ]
		) +
		geom_hline(yintercept = 7, lty = "dashed") +
		geom_vline(xintercept = 4, lty = "dashed")
plotPDF(p,name = "4.TSS-vs-Frags_cutoff.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE)

# dimensional reduction
proj_CAD_2 <- addIterativeLSI(
    ArchRProj = proj_CAD_2,

proj_CAD_2 <-	proj_CAD_2 |>
				addIterativeLSI(
					useMatrix = "TileMatrix", 
					name = "IterativeLSI", 
					iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
					clusterParams =	list(
										resolution = c(0.2), 
										sampleCells = 10000, 
										n.start = 10
@@ -119,87 +129,126 @@ proj_CAD_2 <- addIterativeLSI(
					varFeatures = 25000, 
					dimsToUse = 1:30,
					seed=1,force=T
)

# basic clustering 
proj_CAD_2 <- addClusters(
    input = proj_CAD_2,
				) |>
				addClusters(
					reducedDims = "IterativeLSI",
					method = "Seurat",
					name = "Clusters",
					resolution = 0.8,
    force=T,seed=1
)


# UMAP embedding
proj_CAD_2 <- addUMAP(
    ArchRProj = proj_CAD_2, 
					force = T,
					seed = 1
				) |>
				addUMAP(
					reducedDims = "IterativeLSI", 
					name = "UMAP", 
					nNeighbors = 30, 
					minDist = 0.5, 
    metric = "cosine",force=T
)

p1 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "Sample", embedding = "UMAP")
p2 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "Clusters", embedding = "UMAP")
plotPDF(p1,p2, name = "Plot-UMAP-Sample-Clusters_LSI.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE, width = 5, height = 5)

# QC score projected on UMAP
p1 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "DoubletEnrichment", embedding = "UMAP")
p2 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "DoubletScore", embedding = "UMAP")
p3 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "PromoterRatio", embedding = "UMAP")
p4 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "NucleosomeRatio", embedding = "UMAP")
p5 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "TSSEnrichment", embedding = "UMAP")
p6 <- plotEmbedding(ArchRProj = proj_CAD_2, colorBy = "cellColData", name = "nFrags", embedding = "UMAP")
plotPDF(p1,p2,p3,p4,p5,p6, name = "Plot-UMAP-Sample-QC_LSI.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE, width = 5, height = 5)


# check batcheffect
proj_CAD_2 <- addHarmony(
    ArchRProj = proj_CAD_2,
					metric = "cosine",
					force = T
				) |>
				addHarmony(
					reducedDims = "IterativeLSI",
					name = "Harmony",
    groupBy = "Sample",force=T
					groupBy = "Sample",
					force=T
				)

proj_CAD_2_HAR <-	addClusters(
						input = proj_CAD_2,
						reducedDims = "Harmony",
						method = "Seurat",
						name = "Clusters",
						resolution = 0.8,
    force=T,seed=1
						force = T,
						seed = 1
					)

cM <- confusionMatrix(paste0(proj_CAD_2$Clusters), paste0(proj_CAD_2$Sample))
cM <- cM / Matrix::rowSums(cM)
p <- pheatmap::pheatmap(
    mat = as.matrix(cM), 
    color = paletteContinuous("whiteBlue"), 
    border_color = "black"
multi_pe_plot <- function(names,proj,file_name)
{
	lapply(
		names,
		function(x)
		{
			plotEmbedding(
				ArchRProj = proj,
				colorBy = "cellColData",
				name = x,
				embedding = "UMAP"
			)
		}
	)|>
	plotPDF(
		name = file_name,
		ArchRProj = proj,
		addDOC = FALSE,
		width = 5,
		height = 5
	)
}

# QC score projected on UMAP
multi_pe_plot(
	c(
		"Sample",
		"Clusters",
		"DoubletEnrichment",
		"DoubletScore",
		"PromoterRatio",
		"NucleosomeRatio",
		"TSSEnrichment",
		"nFrags"
	),
	proj_CAD_2,
	"5.Plot-UMAP-Sample_LSI.pdf"
)
plotPDF(p, name = "confusionMap_heatmap_LSI.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE)

cM <- confusionMatrix(paste0(proj_CAD_2_HAR$Clusters), paste0(proj_CAD_2_HAR$Sample))

confusionmap <-	function(
					proj,
					file_name,
					addDOC = FALSE,
					width = NA,
					height = NA
){
	cM <-	confusionMatrix(
				paste0(proj$Clusters),
				paste0(proj$Sample)
			)
	cM <- cM / Matrix::rowSums(cM)
p2 <- pheatmap::pheatmap(
	pheatmap::pheatmap(
		mat = as.matrix(cM), 
		color = paletteContinuous("whiteBlue"), 
		border_color = "black"
	) |>
	plotPDF(
		name = file_name,
		ArchRProj = proj,
		addDOC = addDOC
	)
}

plotPDF(p, name = "confusionMap_heatmap_LSI.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE)
plotPDF(p2, name = "confusionMap_heatmap_HAR.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE)

p1 <- plotEmbedding(ArchRProj = proj_CAD_2_HAR, colorBy = "cellColData", name = "Sample", embedding = "UMAP")
p2 <- plotEmbedding(ArchRProj = proj_CAD_2_HAR, colorBy = "cellColData", name = "Clusters", embedding = "UMAP")
plotPDF(p1,p2, name = "Plot-UMAP-Sample-Clusters_HAR.pdf", ArchRProj = proj_CAD_2, addDOC = FALSE, width = 5, height = 5)
confusionmap(proj_CAD_2,"6.confusionMap_heatmap_LSI.pdf")
confusionmap(proj_CAD_2_HAR,"7.confusionMap_heatmap_HAR.pdf")

multi_pe_plot(
	c(
		"Sample",
		"Clusters",
		"DoubletEnrichment",
		"DoubletScore",
		"PromoterRatio",
		"NucleosomeRatio",
		"TSSEnrichment",
		"nFrags"
	),
	proj_CAD_2_HAR,
	"8.Plot-UMAP-Sample_HAR.pdf"
)

proj_CAD_2 <-	proj_CAD_2 |>
				addGeneScoreMatrix(force = TRUE) |>
				addImputeWeights(seed = 1)

# gene score with ArchR default method
proj_CAD_2<-addGeneScoreMatrix(proj_CAD_2,force=TRUE)
proj_CAD_2 <- addImputeWeights(proj_CAD_2,seed=1)

markersGS <-	getMarkerFeatures(
					ArchRProj = proj_CAD_2, 
@@ -225,10 +274,9 @@ driverSMC <- c("LGALS3","FN1","TNFRSF11B","COL1A1","COL4A1","COL4A2","COL6A3")
useMKG <- intersect(c(DSMCmarker,MSMCmarker,Emarker,Tmarker,Macrophage,PericyteMarker,Osteochondrogenic,PotentialSMC,driverSMC), uniq_symbol)
markerGenes <- useMKG#c(DSMCmarker,MSMCmarker,Emarker,Tmarker)


# integration with scRNAseq data
seRNA <- readRDS("scRNA_PC10.rds");
celltype_meta <- read.table("PC10_celltype_assignment.txt",row.names=1,header=T)
seRNA <- readRDS("scRNA_analysis_result/scRNA_PC10.rds");
celltype_meta <- fread("scRNA_analysis_result/PC10_celltype_assignment.txt")

CT1 <- as.vector(celltype_meta[,"celltype"])
CT1[which(celltype_meta[,"celltype"]=="T/NK")] <- "T_NK"
+13 −5
Original line number Diff line number Diff line
@@ -4,6 +4,8 @@ pacman::p_load(data.table,magrittr,Seurat,ggplot2,patchwork)
set.seed(1)

result_dir <- "scRNA_analysis_result"
if(!dir.exists(result_dir)){dir.create(result_dir)}


# the input data GSE131778_human_coronary_scRNAseq_wirka_et_al_GEO.txt was downloaded from GEO, GSE131778, https://ftp.ncbi.nlm.nih.gov/geo/series/GSE131nnn/GSE131778/suppl/GSE131778%5Fhuman%5Fcoronary%5FscRNAseq%5Fwirka%5Fet%5Fal%5FGEO%2Etxt%2Egz
data_raw <- fread(
@@ -35,8 +37,6 @@ plot1 + plot2
dev.off()

# QC cell filtering and normalization and highvar genes selection


genes_expCellNum <- apply(
						data_raw@assays$RNA@counts,
						1,
@@ -55,6 +55,11 @@ data_highVar <- data_raw[which(genes_expCellNum >=5),] |>
					selection.method = "vst",
					nfeatures = 2000
				)

top10_highVar_genes <-	data_highVar |>
						VariableFeatures() |>
						head(10)

pdf_save("2.highVar_genes_selection_scatter")
plot1 <- VariableFeaturePlot(data_highVar)
plot2 <- LabelPoints(plot = plot1, points = top10_highVar_genes, repel = TRUE)
@@ -85,9 +90,7 @@ dev.off()

# determine the dim
data_PCA_ex <-	data_PCA |> 
				JackStraw(
					num.replicate = 100
				) |>
				JackStraw(num.replicate = 100) |>
				ScoreJackStraw()

pdf_save("6.PCA_DiffDim_cmp_v1")
@@ -243,3 +246,8 @@ fwrite(
	sep = "\t",
	quote = F
)

saveRDS(
	data_UMAPtsne_K10,
	file = file.path(result_dir,"scRNA_PC10.rds")
)