Commit 1827f28e authored by Chaos's avatar Chaos
Browse files

rewrite with data.table and generate all plots to result dir

parent fd800029
Loading
Loading
Loading
Loading
+200 −123
Original line number Diff line number Diff line

# scRNAseq data processing, mainly with Seurat package, version4.0.0
library(dplyr)
library(Seurat)
library(patchwork)
# scRNAseq data processing, mainly with Seurat package
pacman::p_load(data.table,magrittr,Seurat,ggplot2,patchwork)
set.seed(1)

result_dir <- "scRNA_analysis_result"

# the input data GSE131778_human_coronary_scRNAseq_wirka_et_al_GEO.txt was downloaded from GEO, GSE131778, https://ftp.ncbi.nlm.nih.gov/geo/series/GSE131nnn/GSE131778/suppl/GSE131778%5Fhuman%5Fcoronary%5FscRNAseq%5Fwirka%5Fet%5Fal%5FGEO%2Etxt%2Egz
expmat <- read.table("GSE131778_human_coronary_scRNAseq_wirka_et_al_GEO.txt",row.names=1,header=T)
data_raw <- CreateSeuratObject(counts = expmat, project = "CAscRNA", min.cells = 3, min.features = 200)
data_raw <- fread(
				"GSE131778/GSE131778_human_coronary_scRNAseq_wirka_et_al_GEO.txt.gz"
			) %>%
			suppressWarnings() %>%
			setDF(rownames = .$V1) %>%
			.[-1] |> 
			CreateSeuratObject(
				project = "CAscRNA", 
				min.cells = 3, 
				min.features = 200
			)

data_raw[["percent.mt"]] <- PercentageFeatureSet(data_raw, pattern = "^MT-")
pdf(file="scRNA_basicQC.pdf",width=12,height=6)

pdf_save <- function(file_name,width = 12, height = 6)
{
	paste0(result_dir,"/",file_name,".pdf") |>
	pdf(width = width, height = height)
}


pdf_save("1.scRNA_basicQC")
VlnPlot(data_raw, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(data_raw, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(data_raw, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
dev.off()

# QC cell filtering
genes_expCellNum <- apply(data_raw@assays$RNA@counts,1,function(x) return( length(which(x>0)) ))
data_filter_tmp <- data_raw[which(genes_expCellNum >=5),]
data_filter <- subset(data_filter_tmp, subset = nFeature_RNA > 500 & nFeature_RNA < 3500 & percent.mt < 7.5)
# QC cell filtering and normalization and highvar genes selection

# normalization
data_normalize <- NormalizeData(data_filter, normalization.method = "LogNormalize", scale.factor = 10000)

# highvar genes selection
data_highVar <- FindVariableFeatures(data_normalize, selection.method = "vst", nfeatures = 2000)
pdf(file="highVar_genes_selection_scatter.pdf",width=12,height=6)
genes_expCellNum <- apply(
						data_raw@assays$RNA@counts,
						1,
						function(x){which(x>0) |> length()}
					)

data_highVar <- data_raw[which(genes_expCellNum >=5),] |>
				subset(
					subset = nFeature_RNA > 500 & nFeature_RNA < 3500 & percent.mt < 7.5
				) |>
				NormalizeData(
					normalization.method = "LogNormalize",
					scale.factor = 10000
				) |>
				FindVariableFeatures(
					selection.method = "vst",
					nfeatures = 2000
				)
pdf_save("2.highVar_genes_selection_scatter")
plot1 <- VariableFeaturePlot(data_highVar)
plot2 <- LabelPoints(plot = plot1, points = top10_highVar_genes, repel = TRUE)
plot1 + plot2
dev.off()

# scaling data
all.genes <- rownames(data_highVar)
data_scale <- ScaleData(data_highVar, features = all.genes)
# scaling data and dimensional reduction
data_PCA <-	ScaleData(
				data_highVar,
				features = rownames(data_highVar)
			) %>%
			RunPCA(
				features = VariableFeatures(object = .)
			)

# dimensional reduction
data_PCA <- RunPCA(data_scale, features = VariableFeatures(object = data_scale))
print(data_PCA[["pca"]], dims = 1:5, nfeatures = 5)
pdf(file="PCA_loading_scatter.pdf",width=12,height=6)
#print(data_PCA[["pca"]], dims = 1:5, nfeatures = 5)
pdf_save("3.PCA_loading_scatter")
VizDimLoadings(data_PCA, dims = 1:2, reduction = "pca")
dev.off()

pdf(file="PCA_reduction_scatter.pdf")
pdf_save("4.PCA_reduction_scatter.pdf")
DimPlot(data_PCA, reduction = "pca")
dev.off()

pdf(file="PCA_loading_heatmap.pdf",width=12,height=6)
pdf_save("5.PCA_loading_heatmap")
DimHeatmap(data_PCA, dims = 1:2, cells = 500, balanced = TRUE)
dev.off()

# determine the dim
data_PCA_tmp <- JackStraw(data_PCA, num.replicate = 100,dims=1:25)
data_PCA_ex <- ScoreJackStraw(data_PCA_tmp, dims = 1:25)
pdf(file="PCA_DiffDim_cmp_v1.pdf",width=12,height=6)
p1 <- JackStrawPlot(data_PCA_ex, dims = 1:15)
data_PCA_ex <-	data_PCA |> 
				JackStraw(
					num.replicate = 100
				) |>
				ScoreJackStraw()

pdf_save("6.PCA_DiffDim_cmp_v1")
p1 <- JackStrawPlot(data_PCA_ex)
p2 <- ElbowPlot(data_PCA_ex)
p1 + p2
dev.off()

proportion_PCA_std <- data_PCA@reductions$pca@stdev/sum(data_PCA@reductions$pca@stdev)
cdf_PCA_std <- c()
cdf_this <- 0
for(i in 1:length(proportion_PCA_std)){
    cdf_this <- cdf_this+proportion_PCA_std[i]
    cdf_PCA_std <- c(cdf_PCA_std, cdf_this)
}
cdf_PCA_std <-	proportion_PCA_std |> cumsum()

pdf(file="PCA_DiffDim_cmp_v2.pdf",width=12,height=6)
pdf_save("7.PCA_DiffDim_cmp_v2")
par(mfrow=c(1,2),mar=c(4,4,2,2))
plot(data_PCA@reductions$pca@stdev,pch=16,ylab="std",xlab="PC")
plot(cdf_PCA_std,pch=16,ylab="CDF proportion of std",xlab="PC")
abline(h=0.5,col="red")
dev.off()

#  clustering
data_KNN_K10 <- FindNeighbors(data_PCA, dims = 1:10)
data_cluster_K10 <- FindClusters(data_KNN_K10, resolution = 0.5, random.seed=1)
#  clustering and UMAP/tsne embbeding
data_UMAP_K10 <-	data_PCA |>
					FindNeighbors(dims = 1:10) |>
					FindClusters(
						resolution = 0.5,
						random.seed = 1
					) |>
					RunUMAP(
						dims = 1:10,
						seed.use = 1
					)

# UMAP/tsne embbeding
data_UMAP_K10 <- RunUMAP(data_cluster_K10, dims = 1:10, seed.use=1)
all_marker_genes_K10 <- FindAllMarkers(data_UMAP_K10, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, random.seed=1)
MKG_K10 <- all_marker_genes_K10[which(all_marker_genes_K10[,"avg_logFC"]>=log(2) & all_marker_genes_K10[,"p_val_adj"]<1e-5),]
data_UMAPtsne_K10 <- RunTSNE(data_UMAP_K10, dims = 1:10, seed.use=1)
all_marker_genes_K10 <- FindAllMarkers(data_UMAP_K10, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, random.seed=1)
MKG_K10 <- all_marker_genes_K10[which(all_marker_genes_K10[,"avg_logFC"]>=log(2) & all_marker_genes_K10[,"p_val_adj"]<1e-5),]
all_DEG_MKG_K10 <- data_UMAP_K10 |>
						FindAllMarkers(
							only.pos = TRUE,
							min.pct = 0.25,
							logfc.threshold = 0.25,
							random.seed = 1
						)

# read in marker gene list from downloaded from Wirka et al 2019, GSE131778
Bcell_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Bcell.txt")[,1])
Endothelial_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Endothelial.txt")[,1])
Fibroblast_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Fibroblast.txt")[,1])
Fibroblast2_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Fibroblast2.txt")[,1])
Fibromyocyte_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Fibromyocyte.txt")[,1])
Macrophage_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Macrophage.txt")[,1])
Mast_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Mast.txt")[,1])
NKcell_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/NKcell.txt")[,1])
Neuron_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Neuron.txt")[,1])
Pericyte1_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Pericyte1.txt")[,1])
Pericyte2_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Pericyte2.txt")[,1])
Plasma1_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Plasma1.txt")[,1])
Plasma2_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Plasma2.txt")[,1])
SMC_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/SMC.txt")[,1])
Tcell_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/Tcell.txt")[,1])
c10_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c10.txt")[,1])
c11_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c11.txt")[,1])
c12_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c12.txt")[,1])
c14_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c14.txt")[,1])
c18_MKG <- as.vector(read.table("scRNA_GSE131778_makergene_list/c18.txt")[,1])
MKG_K10 <-	as.data.table(
				all_DEG_MKG_K10
			)[
				avg_log2FC >= log(2) & p_val_adj < 1e-5
			]

data_UMAPtsne_K10 <-	data_UMAP_K10 |>
						RunTSNE(
							dims = 1:10,
							seed.use = 1
						)

# read in marker gene list from downloaded from Wirka et al 2019, GSE131778
# the original table is in xlsx and was transfromed to txt
# one gene in xlsx was shown as "2-sep",which is definitely wrong, I guess it should be "SEPT4"
GSE_MKG <-	fread(
				"GSE131778/cluster_marker_genes.txt",
				header = F
			) |>
			setnames(c("cluster","gene"))


## compare clusters with marker genes, generate confusion matrix  

compare_celltype <- function(usedata, outname, MKGdata){
    ### compare with DEG
    DEGovMAT <- matrix(rep(0, 21*length(table(usedata$seurat_clusters))),ncol=21)
    rownames(DEGovMAT) <- names(table(usedata$seurat_clusters))
    colnames(DEGovMAT) <- c("Bcell","Endothelial","Fibroblast","Fibroblast2","Fibromyocyte","Macrophage","Mast","NKcell","Neuron","Pericyte1","Pericyte2","Plasma1","Plasma2","SMC","Tcell","c10","c11","c12","c14","c18","total")
    for(i in rownames(DEGovMAT)){
        MKG_thisCluster <- MKGdata[which(MKGdata[,"cluster"]==i),"gene"]
        DEGovMAT[i,"Bcell"] = length(intersect(MKG_thisCluster, Bcell_MKG))
        DEGovMAT[i,"Endothelial"] = length(intersect(MKG_thisCluster, Endothelial_MKG))
        DEGovMAT[i,"Fibroblast"] = length(intersect(MKG_thisCluster, Fibroblast_MKG))
        DEGovMAT[i,"Fibroblast2"] = length(intersect(MKG_thisCluster, Fibroblast2_MKG))
        DEGovMAT[i,"Fibromyocyte"] = length(intersect(MKG_thisCluster, Fibromyocyte_MKG))
        DEGovMAT[i,"Macrophage"] = length(intersect(MKG_thisCluster, Macrophage_MKG))
        DEGovMAT[i,"Mast"] = length(intersect(MKG_thisCluster, Mast_MKG))
        DEGovMAT[i,"NKcell"] = length(intersect(MKG_thisCluster, NKcell_MKG))
        DEGovMAT[i,"Neuron"] = length(intersect(MKG_thisCluster, Neuron_MKG))
        DEGovMAT[i,"Pericyte1"] = length(intersect(MKG_thisCluster, Pericyte1_MKG))
        DEGovMAT[i,"Pericyte2"] = length(intersect(MKG_thisCluster, Pericyte2_MKG))
        DEGovMAT[i,"Plasma1"] = length(intersect(MKG_thisCluster, Plasma1_MKG))
        DEGovMAT[i,"Plasma2"] = length(intersect(MKG_thisCluster, Plasma2_MKG))
        DEGovMAT[i,"SMC"] = length(intersect(MKG_thisCluster, SMC_MKG))
        DEGovMAT[i,"Tcell"] = length(intersect(MKG_thisCluster, Tcell_MKG))
        DEGovMAT[i,"c10"] = length(intersect(MKG_thisCluster, c10_MKG))
        DEGovMAT[i,"c11"] = length(intersect(MKG_thisCluster, c11_MKG))
        DEGovMAT[i,"c12"] = length(intersect(MKG_thisCluster, c12_MKG))
        DEGovMAT[i,"c14"] = length(intersect(MKG_thisCluster, c14_MKG))
        DEGovMAT[i,"c18"] = length(intersect(MKG_thisCluster, c18_MKG))
        DEGovMAT[i,"total"] = length(MKG_thisCluster)
    }
    write.table(DEGovMAT, file=paste0(outname,"_ownClusterDEG_vs_paperDEG.txt"),row.names=T,col.names=T,sep="\t", quote=F)   
DEGovGSE <-	expand.grid(
				MKG_K10$cluster |> unique(),
				GSE_MKG$cluster |> unique(),
				stringsAsFactors = F
			) |>
			apply(
				1,
				function(x){
					data.table(
						x[1],
						x[2],
						intersect(
							MKG_K10[cluster == x[1],gene],
							GSE_MKG[cluster == x[2],gene]
						) |> length()
					)[
						,V4 := V3/MKG_K10[cluster == x[1],.N]
					]
				}

compare_celltype(data_UMAP_K10, "PC10", MKG_K10)
			) |>
			rbindlist() %>%
			.[,V1 := factor(V1,levels = levels(data_UMAP_K10$seurat_clusters))]

# assign cell types to clusters based on confusion matrix comparing marker gene assignment. 
own_celltype <- c("Fibroblast","Endothelial","Macrophage","Fibro","T/NK","SMC","Pericyte1","unknown1","Pericyte2","B","Plasma","unknown2","Neuron","unknown3","Mast")

# plot the matrix
pdf_save("8.DEGovGSE_cluster_overlap",height = 10, width = 10)
DEGovGSE |>
ggplot(aes(V1,V2,fill = V4)) + 
geom_tile() + 
theme(
	axis.title = element_blank(),
	legend.title = element_blank(),
	panel.background = element_blank(),
	axis.ticks = element_blank()
) + 
scale_fill_distiller(palette = 4)
dev.off()

# check with the knowledge based marker genes for cell  types
DSMCmarker <- c("MYOCD","SRF","TEAD3","TEAD4","ACTA2","MYH11","TAGLN","LMOD1","CNN1","TPM2","MYL9")
MSMCmarker <- c("TCF21","KLF4","FN1","LUM","TNFRSF11B","BGN")
Emarker <- c("KLF2","PECAM1","CLDN5","PLVAP","ACKR1","EGFL7", "NFKB1","NFKB2","VCAM1","SELE")

pdf(file="MKG_boxplot.pdf",width=24,height=24)
VlnPlot(data_UMAP_K10, features = c(DSMCmarker,MSMCmarker,Emarker ))
dev.off()
pdf(file="MKG_scatter.pdf",width=40,height=40)
FeaturePlot(data_UMAP_K10, features = c(DSMCmarker,MSMCmarker,Emarker ))
pdf_save("9.MKG_boxplot",width = 24,height = 24)
VlnPlot(
	data_UMAP_K10,
	features = c(DSMCmarker, MSMCmarker, Emarker)
)
dev.off()
top10marker <- MKG_K10 %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
pdf(file="top10marker_exp_heatmap.pdf")
DoHeatmap(data_UMAP_K10, features = top10marker$gene) + NoLegend()

pdf_save("10.MKG_scatter",width = 40,height = 40)
FeaturePlot(
	data_UMAP_K10,
	features = c(DSMCmarker, MSMCmarker, Emarker)
)
dev.off()

top10marker_gene <-	setorder(
						MKG_K10,
						-avg_log2FC
					)[
						,head(.SD,10),
						.(cluster)
					][
						,gene
					]

pdf_save("11.top10marker_exp_heatmap")
DoHeatmap(
	data_UMAP_K10,
	features = top10marker_gene
) +
NoLegend()
dev.off()


# output processed scRNA data for integration analysis with scATAC
own_cluster <- as.vector(data_UMAP_K10$seurat_clusters)
names(own_cluster) <- names(data_UMAP_K10$seurat_clusters)

out_celltype <- matrix(rep(0, 2*length(own_cluster)),ncol=2)
rownames(out_celltype) <- names(own_cluster)
colnames(out_celltype)<-c("cluster","celltype")
idx <- rownames(out_celltype)
out_celltype[idx,"cluster"] <- own_cluster[idx]
#out_celltype[idx,"celltype"] <- 
for(i in names(table(out_celltype[,"cluster"]))){
    out_celltype[which(out_celltype[,"cluster"]==i),"celltype"] <- own_celltype[as.numeric(i)+1]
}
own_cluster <-	data_UMAP_K10$seurat_clusters %>%
				as.data.frame() %>%
				setDT(keep.rownames = "barcode") %>%
				setnames(".","cluster") %>%
				setorder(cluster) %>%
				.[,cell_type := own_celltype[cluster]]

write.table(out_celltype,file="PC10_celltype_assignment.txt",row.names=T,col.names=T,sep="\t",quote=F)
saveRDS(data_UMAPtsne_K10, file="scRNA_PC10.rds")
fwrite(
	own_cluster,
	file.path(result_dir,"PC10_celltype_assignment.txt"),
	col.names = T,
	sep = "\t",
	quote = F
)

save(
	all_DEG_MKG_K10,
	cdf_PCA_std,
	data_highVar,
	data_PCA,
	data_PCA_ex,
	data_raw,
	data_UMAP_K10,
	data_UMAPtsne_K10,
	DEGovGSE,
	MSMCmarker,
	DSMCmarker,
	Emarker,
	genes_expCellNum,
	GSE_MKG,
	MKG_K10,
	own_cluster,
	proportion_PCA_std,
	file = file.path(result_dir,"scRNA.rData")
)
+21.8 MiB

File added.

No diff preview for this file type.

+1970 −0

File added.

Preview size limit exceeded, changes collapsed.

+3.13 MiB

File added.

No diff preview for this file type.

+17.4 MiB

File added.

No diff preview for this file type.

Loading