rewrite with data.table and generate all plots to result dir (1827f28e) · Commits · Chaos / Coronary_snATAC

2_scRNA_analysis.r

+200 −123

Original line number	Diff line number	Diff line

		# scRNAseq data processing, mainly with Seurat package, version4.0.0
		library(dplyr)
		library(Seurat)
		library(patchwork)
		# scRNAseq data processing, mainly with Seurat package
		pacman::p_load(data.table,magrittr,Seurat,ggplot2,patchwork)
		set.seed(1)

		result_dir <- "scRNA_analysis_result"

		# the input data GSE131778_human_coronary_scRNAseq_wirka_et_al_GEO.txt was downloaded from GEO, GSE131778, https://ftp.ncbi.nlm.nih.gov/geo/series/GSE131nnn/GSE131778/suppl/GSE131778%5Fhuman%5Fcoronary%5FscRNAseq%5Fwirka%5Fet%5Fal%5FGEO%2Etxt%2Egz
		expmat <- read.table("GSE131778_human_coronary_scRNAseq_wirka_et_al_GEO.txt",row.names=1,header=T)
		data_raw <- CreateSeuratObject(counts = expmat, project = "CAscRNA", min.cells = 3, min.features = 200)
		data_raw <- fread(
		"GSE131778/GSE131778_human_coronary_scRNAseq_wirka_et_al_GEO.txt.gz"
		) %>%
		suppressWarnings() %>%
		setDF(rownames = .$V1) %>%
		.[-1] \|>
		CreateSeuratObject(
		project = "CAscRNA",
		min.cells = 3,
		min.features = 200
		)

		data_raw[["percent.mt"]] <- PercentageFeatureSet(data_raw, pattern = "^MT-")
		pdf(file="scRNA_basicQC.pdf",width=12,height=6)

		pdf_save <- function(file_name,width = 12, height = 6)
		{
		paste0(result_dir,"/",file_name,".pdf") \|>
		pdf(width = width, height = height)
		}


		pdf_save("1.scRNA_basicQC")
		VlnPlot(data_raw, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
		plot1 <- FeatureScatter(data_raw, feature1 = "nCount_RNA", feature2 = "percent.mt")
		plot2 <- FeatureScatter(data_raw, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
		plot1 + plot2
		dev.off()

		# QC cell filtering
		genes_expCellNum <- apply(data_raw@assays$RNA@counts,1,function(x) return( length(which(x>0)) ))
		data_filter_tmp <- data_raw[which(genes_expCellNum >=5),]
		data_filter <- subset(data_filter_tmp, subset = nFeature_RNA > 500 & nFeature_RNA < 3500 & percent.mt < 7.5)
		# QC cell filtering and normalization and highvar genes selection

		# normalization
		data_normalize <- NormalizeData(data_filter, normalization.method = "LogNormalize", scale.factor = 10000)

		# highvar genes selection
		data_highVar <- FindVariableFeatures(data_normalize, selection.method = "vst", nfeatures = 2000)
		pdf(file="highVar_genes_selection_scatter.pdf",width=12,height=6)
		genes_expCellNum <- apply(
		data_raw@assays$RNA@counts,
		1,
		function(x){which(x>0) \|> length()}
		)

		data_highVar <- data_raw[which(genes_expCellNum >=5),] \|>
		subset(
		subset = nFeature_RNA > 500 & nFeature_RNA < 3500 & percent.mt < 7.5
		) \|>
		NormalizeData(
		normalization.method = "LogNormalize",
		scale.factor = 10000
		) \|>
		FindVariableFeatures(
		selection.method = "vst",
		nfeatures = 2000
		)
		pdf_save("2.highVar_genes_selection_scatter")
		plot1 <- VariableFeaturePlot(data_highVar)
		plot2 <- LabelPoints(plot = plot1, points = top10_highVar_genes, repel = TRUE)
		plot1 + plot2
		dev.off()

		# scaling data
		all.genes <- rownames(data_highVar)
		data_scale <- ScaleData(data_highVar, features = all.genes)
		# scaling data and dimensional reduction
		data_PCA <- ScaleData(
		data_highVar,
		features = rownames(data_highVar)
		) %>%
		RunPCA(
		features = VariableFeatures(object = .)
		)

		# dimensional reduction
		data_PCA <- RunPCA(data_scale, features = VariableFeatures(object = data_scale))
		print(data_PCA[["pca"]], dims = 1:5, nfeatures = 5)
		pdf(file="PCA_loading_scatter.pdf",width=12,height=6)
		#print(data_PCA[["pca"]], dims = 1:5, nfeatures = 5)
		pdf_save("3.PCA_loading_scatter")
		VizDimLoadings(data_PCA, dims = 1:2, reduction = "pca")
		dev.off()

		pdf(file="PCA_reduction_scatter.pdf")
		pdf_save("4.PCA_reduction_scatter.pdf")
		DimPlot(data_PCA, reduction = "pca")
		dev.off()

		pdf(file="PCA_loading_heatmap.pdf",width=12,height=6)
		pdf_save("5.PCA_loading_heatmap")
		DimHeatmap(data_PCA, dims = 1:2, cells = 500, balanced = TRUE)
		dev.off()

		# determine the dim
		data_PCA_tmp <- JackStraw(data_PCA, num.replicate = 100,dims=1:25)
		data_PCA_ex <- ScoreJackStraw(data_PCA_tmp, dims = 1:25)
		pdf(file="PCA_DiffDim_cmp_v1.pdf",width=12,height=6)
		p1 <- JackStrawPlot(data_PCA_ex, dims = 1:15)
		data_PCA_ex <- data_PCA \|>
		JackStraw(
		num.replicate = 100
		) \|>
		ScoreJackStraw()

		pdf_save("6.PCA_DiffDim_cmp_v1")
		p1 <- JackStrawPlot(data_PCA_ex)
		p2 <- ElbowPlot(data_PCA_ex)
		p1 + p2
		dev.off()

		proportion_PCA_std <- data_PCA@reductions$pca@stdev/sum(data_PCA@reductions$pca@stdev)
		cdf_PCA_std <- c()
		cdf_this <- 0
		for(i in 1:length(proportion_PCA_std)){
		cdf_this <- cdf_this+proportion_PCA_std[i]
		cdf_PCA_std <- c(cdf_PCA_std, cdf_this)
		}
		cdf_PCA_std <- proportion_PCA_std \|> cumsum()

		pdf(file="PCA_DiffDim_cmp_v2.pdf",width=12,height=6)
		pdf_save("7.PCA_DiffDim_cmp_v2")
		par(mfrow=c(1,2),mar=c(4,4,2,2))
		plot(data_PCA@reductions$pca@stdev,pch=16,ylab="std",xlab="PC")
		plot(cdf_PCA_std,pch=16,ylab="CDF proportion of std",xlab="PC")
		abline(h=0.5,col="red")
		dev.off()

		# clustering
		data_KNN_K10 <- FindNeighbors(data_PCA, dims = 1:10)
		data_cluster_K10 <- FindClusters(data_KNN_K10, resolution = 0.5, random.seed=1)
		# clustering and UMAP/tsne embbeding
		data_UMAP_K10 <- data_PCA \|>
		FindNeighbors(dims = 1:10) \|>
		FindClusters(
		resolution = 0.5,
		random.seed = 1
		) \|>
		RunUMAP(
		dims = 1:10,
		seed.use = 1
		)

		# UMAP/tsne embbeding
		data_UMAP_K10 <- RunUMAP(data_cluster_K10, dims = 1:10, seed.use=1)
		all_marker_genes_K10 <- FindAllMarkers(data_UMAP_K10, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, random.seed=1)
		MKG_K10 <- all_marker_genes_K10[which(all_marker_genes_K10[,"avg_logFC"]>=log(2) & all_marker_genes_K10[,"p_val_adj"]<1e-5),]
		data_UMAPtsne_K10 <- RunTSNE(data_UMAP_K10, dims = 1:10, seed.use=1)
		all_marker_genes_K10 <- FindAllMarkers(data_UMAP_K10, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, random.seed=1)
		MKG_K10 <- all_marker_genes_K10[which(all_marker_genes_K10[,"avg_logFC"]>=log(2) & all_marker_genes_K10[,"p_val_adj"]<1e-5),]
		all_DEG_MKG_K10 <- data_UMAP_K10 \|>
		FindAllMarkers(
		only.pos = TRUE,
		min.pct = 0.25,
		logfc.threshold = 0.25,
		random.seed = 1
		)

		# read in marker gene list from downloaded from Wirka et al 2019, GSE131778
		Bcell_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Bcell.txt")[,1])
		Endothelial_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Endothelial.txt")[,1])
		Fibroblast_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Fibroblast.txt")[,1])
		Fibroblast2_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Fibroblast2.txt")[,1])
		Fibromyocyte_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Fibromyocyte.txt")[,1])
		Macrophage_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Macrophage.txt")[,1])
		Mast_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Mast.txt")[,1])
		NKcell_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/NKcell.txt")[,1])
		Neuron_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Neuron.txt")[,1])
		Pericyte1_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Pericyte1.txt")[,1])
		Pericyte2_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Pericyte2.txt")[,1])
		Plasma1_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Plasma1.txt")[,1])
		Plasma2_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Plasma2.txt")[,1])
		SMC_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/SMC.txt")[,1])
		Tcell_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Tcell.txt")[,1])
		c10_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c10.txt")[,1])
		c11_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c11.txt")[,1])
		c12_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c12.txt")[,1])
		c14_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c14.txt")[,1])
		c18_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c18.txt")[,1])
		MKG_K10 <- as.data.table(
		all_DEG_MKG_K10
		)[
		avg_log2FC >= log(2) & p_val_adj < 1e-5
		]

		data_UMAPtsne_K10 <- data_UMAP_K10 \|>
		RunTSNE(
		dims = 1:10,
		seed.use = 1
		)

		# read in marker gene list from downloaded from Wirka et al 2019, GSE131778
		# the original table is in xlsx and was transfromed to txt
		# one gene in xlsx was shown as "2-sep",which is definitely wrong, I guess it should be "SEPT4"
		GSE_MKG <- fread(
		"GSE131778/cluster_marker_genes.txt",
		header = F
		) \|>
		setnames(c("cluster","gene"))


		## compare clusters with marker genes, generate confusion matrix

		compare_celltype <- function(usedata, outname, MKGdata){
		### compare with DEG
		DEGovMAT <- matrix(rep(0, 21*length(table(usedata$seurat_clusters))),ncol=21)
		rownames(DEGovMAT) <- names(table(usedata$seurat_clusters))
		colnames(DEGovMAT) <- c("Bcell","Endothelial","Fibroblast","Fibroblast2","Fibromyocyte","Macrophage","Mast","NKcell","Neuron","Pericyte1","Pericyte2","Plasma1","Plasma2","SMC","Tcell","c10","c11","c12","c14","c18","total")
		for(i in rownames(DEGovMAT)){
		MKG_thisCluster <- MKGdata[which(MKGdata[,"cluster"]==i),"gene"]
		DEGovMAT[i,"Bcell"] = length(intersect(MKG_thisCluster, Bcell_MKG))
		DEGovMAT[i,"Endothelial"] = length(intersect(MKG_thisCluster, Endothelial_MKG))
		DEGovMAT[i,"Fibroblast"] = length(intersect(MKG_thisCluster, Fibroblast_MKG))
		DEGovMAT[i,"Fibroblast2"] = length(intersect(MKG_thisCluster, Fibroblast2_MKG))
		DEGovMAT[i,"Fibromyocyte"] = length(intersect(MKG_thisCluster, Fibromyocyte_MKG))
		DEGovMAT[i,"Macrophage"] = length(intersect(MKG_thisCluster, Macrophage_MKG))
		DEGovMAT[i,"Mast"] = length(intersect(MKG_thisCluster, Mast_MKG))
		DEGovMAT[i,"NKcell"] = length(intersect(MKG_thisCluster, NKcell_MKG))
		DEGovMAT[i,"Neuron"] = length(intersect(MKG_thisCluster, Neuron_MKG))
		DEGovMAT[i,"Pericyte1"] = length(intersect(MKG_thisCluster, Pericyte1_MKG))
		DEGovMAT[i,"Pericyte2"] = length(intersect(MKG_thisCluster, Pericyte2_MKG))
		DEGovMAT[i,"Plasma1"] = length(intersect(MKG_thisCluster, Plasma1_MKG))
		DEGovMAT[i,"Plasma2"] = length(intersect(MKG_thisCluster, Plasma2_MKG))
		DEGovMAT[i,"SMC"] = length(intersect(MKG_thisCluster, SMC_MKG))
		DEGovMAT[i,"Tcell"] = length(intersect(MKG_thisCluster, Tcell_MKG))
		DEGovMAT[i,"c10"] = length(intersect(MKG_thisCluster, c10_MKG))
		DEGovMAT[i,"c11"] = length(intersect(MKG_thisCluster, c11_MKG))
		DEGovMAT[i,"c12"] = length(intersect(MKG_thisCluster, c12_MKG))
		DEGovMAT[i,"c14"] = length(intersect(MKG_thisCluster, c14_MKG))
		DEGovMAT[i,"c18"] = length(intersect(MKG_thisCluster, c18_MKG))
		DEGovMAT[i,"total"] = length(MKG_thisCluster)
		}
		write.table(DEGovMAT, file=paste0(outname,"_ownClusterDEG_vs_paperDEG.txt"),row.names=T,col.names=T,sep="\t", quote=F)
		DEGovGSE <- expand.grid(
		MKG_K10$cluster \|> unique(),
		GSE_MKG$cluster \|> unique(),
		stringsAsFactors = F
		) \|>
		apply(
		1,
		function(x){
		data.table(
		x[1],
		x[2],
		intersect(
		MKG_K10[cluster == x[1],gene],
		GSE_MKG[cluster == x[2],gene]
		) \|> length()
		)[
		,V4 := V3/MKG_K10[cluster == x[1],.N]
		]
		}

		compare_celltype(data_UMAP_K10, "PC10", MKG_K10)
		) \|>
		rbindlist() %>%
		.[,V1 := factor(V1,levels = levels(data_UMAP_K10$seurat_clusters))]

		# assign cell types to clusters based on confusion matrix comparing marker gene assignment.
		own_celltype <- c("Fibroblast","Endothelial","Macrophage","Fibro","T/NK","SMC","Pericyte1","unknown1","Pericyte2","B","Plasma","unknown2","Neuron","unknown3","Mast")

		# plot the matrix
		pdf_save("8.DEGovGSE_cluster_overlap",height = 10, width = 10)
		DEGovGSE \|>
		ggplot(aes(V1,V2,fill = V4)) +
		geom_tile() +
		theme(
		axis.title = element_blank(),
		legend.title = element_blank(),
		panel.background = element_blank(),
		axis.ticks = element_blank()
		) +
		scale_fill_distiller(palette = 4)
		dev.off()

		# check with the knowledge based marker genes for cell types
		DSMCmarker <- c("MYOCD","SRF","TEAD3","TEAD4","ACTA2","MYH11","TAGLN","LMOD1","CNN1","TPM2","MYL9")
		MSMCmarker <- c("TCF21","KLF4","FN1","LUM","TNFRSF11B","BGN")
		Emarker <- c("KLF2","PECAM1","CLDN5","PLVAP","ACKR1","EGFL7", "NFKB1","NFKB2","VCAM1","SELE")

		pdf(file="MKG_boxplot.pdf",width=24,height=24)
		VlnPlot(data_UMAP_K10, features = c(DSMCmarker,MSMCmarker,Emarker ))
		dev.off()
		pdf(file="MKG_scatter.pdf",width=40,height=40)
		FeaturePlot(data_UMAP_K10, features = c(DSMCmarker,MSMCmarker,Emarker ))
		pdf_save("9.MKG_boxplot",width = 24,height = 24)
		VlnPlot(
		data_UMAP_K10,
		features = c(DSMCmarker, MSMCmarker, Emarker)
		)
		dev.off()
		top10marker <- MKG_K10 %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
		pdf(file="top10marker_exp_heatmap.pdf")
		DoHeatmap(data_UMAP_K10, features = top10marker$gene) + NoLegend()

		pdf_save("10.MKG_scatter",width = 40,height = 40)
		FeaturePlot(
		data_UMAP_K10,
		features = c(DSMCmarker, MSMCmarker, Emarker)
		)
		dev.off()

		top10marker_gene <- setorder(
		MKG_K10,
		-avg_log2FC
		)[
		,head(.SD,10),
		.(cluster)
		][
		,gene
		]

		pdf_save("11.top10marker_exp_heatmap")
		DoHeatmap(
		data_UMAP_K10,
		features = top10marker_gene
		) +
		NoLegend()
		dev.off()


		# output processed scRNA data for integration analysis with scATAC
		own_cluster <- as.vector(data_UMAP_K10$seurat_clusters)
		names(own_cluster) <- names(data_UMAP_K10$seurat_clusters)

		out_celltype <- matrix(rep(0, 2*length(own_cluster)),ncol=2)
		rownames(out_celltype) <- names(own_cluster)
		colnames(out_celltype)<-c("cluster","celltype")
		idx <- rownames(out_celltype)
		out_celltype[idx,"cluster"] <- own_cluster[idx]
		#out_celltype[idx,"celltype"] <-
		for(i in names(table(out_celltype[,"cluster"]))){
		out_celltype[which(out_celltype[,"cluster"]==i),"celltype"] <- own_celltype[as.numeric(i)+1]
		}
		own_cluster <- data_UMAP_K10$seurat_clusters %>%
		as.data.frame() %>%
		setDT(keep.rownames = "barcode") %>%
		setnames(".","cluster") %>%
		setorder(cluster) %>%
		.[,cell_type := own_celltype[cluster]]

		write.table(out_celltype,file="PC10_celltype_assignment.txt",row.names=T,col.names=T,sep="\t",quote=F)
		saveRDS(data_UMAPtsne_K10, file="scRNA_PC10.rds")
		fwrite(
		own_cluster,
		file.path(result_dir,"PC10_celltype_assignment.txt"),
		col.names = T,
		sep = "\t",
		quote = F
		)

		save(
		all_DEG_MKG_K10,
		cdf_PCA_std,
		data_highVar,
		data_PCA,
		data_PCA_ex,
		data_raw,
		data_UMAP_K10,
		data_UMAPtsne_K10,
		DEGovGSE,
		MSMCmarker,
		DSMCmarker,
		Emarker,
		genes_expCellNum,
		GSE_MKG,
		MKG_K10,
		own_cluster,
		proportion_PCA_std,
		file = file.path(result_dir,"scRNA.rData")
		)