update to DAseq package (7b2255cf) · Commits · github_fork / DAseq2

DA-seq_melanoma.R

deleted100644 → 0

+0 −276

Original line number	Diff line number	Diff line
		### DA-seq applied on immune cells from melanoma patients
		# Author: Jun Zhao (jun.zhao@yale.edu)

		# Original paper: https://www.sciencedirect.com/science/article/pii/S0092867418313941

		library(Seurat)
		library(reshape2)

		#! Please set working directory to your local github repo of DA-seq
		source("./DA-seq.R")
		dir.create("./plots/")



		## Set some parameters

		# Seurat version, default 2, can also be 3
		Sversion <- 2

		# set the python path to use
		python <- "/usr/bin/python"

		# set GPU number to use
		GPU <- ""



		## Data preprocessing

		# load data
		download.file(
		"ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE120nnn/GSE120575/suppl/GSE120575_Sade_Feldman_melanoma_single_cells_TPM_GEO.txt.gz",
		"./data/GSE120575_Sade_Feldman_melanoma_single_cells_TPM_GEO.txt.gz"
		)
		data_exp <- read.table(
		"./data/GSE120575_Sade_Feldman_melanoma_single_cells_TPM_GEO.txt.gz",
		sep = "\t", header = F, row.names = 1, stringsAsFactors = F, skip = 2
		)
		data_exp <- data_exp[,-16292]

		data_colInfo <- read.table(
		"./data/GSE120575_Sade_Feldman_melanoma_single_cells_TPM_GEO.txt.gz",
		sep = "\t", header = F, stringsAsFactors = F, nrows = 2
		)
		data_colInfo <- data_colInfo[,-1]

		colnames(data_exp) <- data_colInfo[1,]

		data_exp <- Matrix(as.matrix(data_exp), sparse = T)


		# patient info
		download.file(
		"ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE120nnn/GSE120575/suppl/GSE120575_patient_ID_single_cells.txt.gz",
		"./data/GSE120575_patient_ID_single_cells.txt.gz"
		)
		patient_info <- read.table(
		"./data/GSE120575_patient_ID_single_cells.txt.gz",
		sep = "\t", header = T, stringsAsFactors = F, skip = 19, nrows = 16291
		)
		mean(colnames(data_exp) == patient_info$title)
		rownames(patient_info) <- patient_info$title


		# cluster info
		cluster_info <- read.table(
		"./data/cluster_info", sep = "\t", header = T, stringsAsFactors = F
		)
		rownames(cluster_info) <- cluster_info$Cell.Name


		# patient lesion info
		lesion_info <- read.table(
		"./data/patient_lesion", sep = "\t", header = T, stringsAsFactors = F
		)


		# Seurat
		if(Sversion == 2){
		data_S <- CreateSeuratObject(
		raw.data = data_exp, project = "melanoma.immune"
		)
		} else if(Sversion == 3){
		data_S <- CreateSeuratObject(
		counts = data_exp, project = "melanoma.immune"
		)
		} else {
		warning("Please set Sversion to 2 or 3!")
		}


		# set metadata for each cell
		data_S@meta.data$condition <- patient_info[colnames(data_exp), "characteristics..response"]
		data_S@meta.data$lesion <- patient_info[colnames(data_exp),
		"characteristics..patinet.ID..Pre.baseline..Post..on.treatment."]
		data_S@meta.data$cluster <- cluster_info$Cluster.number

		data_S <- ScaleData(data_S)


		# calculate gene variance to set variable genes
		gene_var <- apply(data_exp, 1, var)
		if(Sversion == 2){
		data_S@var.genes <- names(gene_var)[gene_var > 6]
		} else if(Sversion == 3){
		data_S@assays$RNA@var.features <- names(gene_var)[gene_var > 6]
		} else {
		warning("Please set Sversion to 2 or 3!")
		}


		# dimension reduction
		if(Sversion == 2){
		data_S <- RunPCA(data_S, pcs.compute = 10, do.print = F)
		data_S <- RunTSNE(data_S, dims.use = 1:10, seed.use = 1)
		TSNEPlot(data_S, group.by = "condition", pt.size = 0.5)
		TSNEPlot(data_S, group.by = "cluster", do.label = T, pt.size = 0.5)
		pca_embedding <- data_S@dr$pca@cell.embeddings
		tsne_embedding <- data_S@dr$tsne@cell.embeddings
		} else if(Sversion == 3){
		data_S <- RunPCA(data_S, npcs = 10, assay = "RNA", features = VariableFeatures(data_S), verbose = F)
		data_S <- RunTSNE(data_S, dims = 1:10, seed.use = 1)
		TSNEPlot(data_S, group.by = "condition", pt.size = 0.5)
		TSNEPlot(data_S, group.by = "cluster", label = T, pt.size = 0.5)
		pca_embedding <- data_S@reductions$pca@cell.embeddings
		tsne_embedding <- data_S@reductions$tsne@cell.embeddings
		} else {
		warning("Please set Sversion to 2 or 3!")
		}



		## DA-seq

		# set k values to use
		k.vector <- seq(50, 500, 50)


		# sample labels for responders and non-responders
		labels_res <- lesion_info[lesion_info$Response.status..R.responder..NR.non.responder == "R","Sample.name"]
		labels_nonres <- lesion_info[lesion_info$Response.status..R.responder..NR.non.responder == "NR","Sample.name"]


		# find top DA cells
		da_cells <- getDAcells(
		X = pca_embedding,
		cell.labels = data_S@meta.data$lesion,
		labels.1 = labels_res,
		labels.2 = labels_nonres,
		k.vector = k.vector,
		k.folds = 10, n.runs = 10, pred.thres = c(0.075,0.925),
		plot.embedding = tsne_embedding,
		python.use = python,
		source.code = "./DA_logit.py", GPU = GPU
		)

		da_cells <- updateDAcells(
		X = da_cells, pred.thres = c(0.075,0.925),
		do.plot = T, plot.embedding = tsne_embedding
		)

		da_cells$pred.plot
		da_cells$da.cells.plot

		ggsave(da_cells$pred.plot + xlab("tSNE_1") + ylab("tSNE_2"),
		filename = "./plots/melanoma_prediction.pdf", width = 5, height = 4.5)
		ggsave(da_cells$da.cells.plot + xlab("tSNE_1") + ylab("tSNE_2"),
		filename = "./plots/melanoma_DA_cells.pdf", width = 4.5, height = 4.5)



		# cluster DA cells to get DA regions
		da_regions <- getDAregion(
		X = pca_embedding,
		cell.idx = da_cells$da.cell.idx,
		k = 5, alpha = 0.25, iter.max = 30,
		cell.labels = data_S@meta.data$lesion,
		labels.1 = labels_res,
		labels.2 = labels_nonres,
		plot.embedding = tsne_embedding
		)
		da_regions$da.region.plot
		da_regions$DA.stat

		ggsave(da_regions$da.region.plot + xlab("tSNE_1") + ylab("tSNE_2"),
		filename = "./plots/melanoma_DA_regions.pdf", width = 5, height = 4.5)
		write.table(da_regions$DA.stat, file = "./plots/melanoma_DA_region_stats.txt", sep = "\t",
		col.names = T, row.names = F, quote = F)


		n.da <- length(unique(da_regions$cluster.res)) - 1


		# STG marker finder for DA regions
		STG.markers <- STGmarkerFinder(
		X = as.matrix(data_S@data),
		cell.idx = da_cells$da.cell.idx,
		da.region.label = da_regions$cluster.res,
		lambda = 1.5, n.runs = 5,
		python.use = "/path/to/your/python",
		source.code = "./DA_STG.py"
		)

		head(STG.markers$da.markers[["1"]])


		# get markers as well as the model
		STG.genes.model <- STGmarkerFinder(
		X = as.matrix(data_S@data),
		cell.idx = da_cells$da.cell.idx,
		da.region.label = da_regions$cluster.res,
		lambda = 1.2, n.runs = 5, return.model = T,
		python.use = "/path/to/your/python",
		source.code = "./DA_STG.py"
		)

		# plot linear predictions from the model for DA region 5
		gg <- plotCellScore(
		X = tsne_embedding,
		score = STG.genes.model$model[["5"]][["pred"]],
		cell.col = viridis_pal()(10)
		)
		ggsave(gg + xlab("tSNE_1") + ylab("tSNE_2"),
		filename = "./plots/melanoma_DA5_STGlinearPred.pdf", width = 5, height = 4.5)



		## Interpret DA regions

		# plot some markers
		marker_genes <- c(
		"CD19","MS4A1","IGHM","CD79A",
		"VCAM1","LAG3","CD27","CD38",
		"CD14","CSF3R","VCAN","LYZ",
		"IL7R","TCF7","CD8A","CCL5",
		"CCR7","LEF1","SELL"
		)

		# add "da" slot
		data_S@meta.data$da <- 0
		data_S@meta.data$da[da_cells$da.cell.idx] <- da_regions$cluster.res


		# add STG information
		STG.marker.info <- do.call(rbind, lapply(STG.genes.model$da.markers, function(x,inputgenes){
		as.numeric(inputgenes %in% x$gene)
		}, inputgenes = rev(marker_genes)))
		STG.marker.info <- rbind(0, STG.marker.info)
		colnames(STG.marker.info) <- rev(marker_genes)
		rownames(STG.marker.info) <- c(1:(n.da+1))
		STG.marker.info[STG.marker.info == 0] <- NA

		STG.marker.info.m <- melt(STG.marker.info)
		STG.marker.info.m <- STG.marker.info.m[-which(is.na(STG.marker.info.m$value)),]
		STG.marker.info.m$value <- 10 * STG.marker.info.m$value


		# create dot plot with Seurat
		if(Sversion == 2){
		STG.marker.info.m$value <- STG.marker.info.m$value / 100
		ggdot <- DotPlot(
		data_S, genes.plot = marker_genes, cols.use = c("gray","red"), group.by = "da",
		x.lab.rot = T, do.return = T
		)
		} else if(Sversion == 3){
		ggdot <- DotPlot(data_S_new, features = marker_genes, cols = c("gray","red"), group.by = "da") +
		RotatedAxis()
		} else {
		warning("Please set Sversion to 2 or 3!")
		}

		gg <- ggdot + geom_point(data = STG.marker.info.m, aes(x = Var2, y = Var1, size = value))
		ggsave(plot = gg, filename = "./plots/melanoma_DotPlot.pdf", width = 10, height = 5)

DAseq.Rproj

0 → 100644

+20 −0

Original line number	Diff line number	Diff line
		Version: 1.0

		RestoreWorkspace: Default
		SaveWorkspace: Default
		AlwaysSaveHistory: Default

		EnableCodeIndexing: Yes
		UseSpacesForTab: Yes
		NumSpacesForTab: 2
		Encoding: UTF-8

		RnwWeave: Sweave
		LaTeX: pdfLaTeX

		AutoAppendNewline: Yes
		StripTrailingWhitespace: Yes

		BuildType: Package
		PackageUseDevtools: Yes
		PackageInstallArgs: --no-multiarch --with-keep.source

DESCRIPTION

0 → 100644

+22 −0

Original line number	Diff line number	Diff line
		Package: DAseq
		Type: Package
		Title: Detecting regions of differential abundance between scRNA-seq datasets
		Version: 1.0.1
		Author: Jun Zhao <jun.zhao@yale.edu>
		Maintainer: Jun Zhao <jun.zhao@yale.edu>
		Description: DA-seq is a multiscale approach for detecting DA subpopulations.
		In contrast to clustering based approaches, our method can detect DA subpopulations that do not form
		well separated clusters.
		For each cell, we compute a multiscale differential abundance score measure.
		These scores are based on the k nearest neighbors in the transcriptome space for various values of k.
		License:
		Encoding: UTF-8
		LazyData: true
		RoxygenNote: 6.1.1
		Imports:
		RANN,
		ggplot2,
		cowplot,
		reticulate,
		scales,
		tclust

NAMESPACE

0 → 100644

+14 −0

Original line number	Diff line number	Diff line
		# Generated by roxygen2: do not edit by hand

		export(STGmarkerFinder)
		export(getDAcells)
		export(getDAregion)
		export(plotCellScore)
		export(runSTG)
		export(updateDAcells)
		import(RANN)
		import(cowplot)
		import(ggplot2)
		import(reticulate)
		import(scales)
		import(tclust)

R/DAseq.R

0 → 100644

+14 −0

Original line number	Diff line number	Diff line
		#' DAseq: Detecting regions of differential abundance between scRNA-seq datasets
		#'
		#' @description DA-seq is a multiscale approach for detecting DA subpopulations.
		#' In contrast to clustering based approaches, our method can detect DA subpopulations that do not form
		#' well separated clusters.
		#' For each cell, we compute a multiscale differential abundance score measure.
		#' These scores are based on the k nearest neighbors in the transcriptome space for various values of k.
		#'
		#' @author Jun Zhao, Ariel Jaffe, Henry Li, Ofir Lindenbaum, Xiuyuan Cheng, Yuval Kluger
		#'
		#' @docType package
		#' @name DAseq
		NULL
		#> NULL

Admin message