update to glmnet (402ff54b) · Commits · github_fork / DAseq2

R/getDAcells.R

+149 −108

Original line number	Diff line number	Diff line
		@@ -5,11 +5,12 @@
		#' Step 2: train a logistic regression classifier based on the multiscale score measure and retain cells
		#' that may reside in DA regions.
		#'
		#' @param X size N-by-p matrix, input merged dataset of interest after dimension reduction
		#' @param cell.labels size N vector, labels for each input cell
		#' @param labels.1 vector, label name(s) that represent condition 1
		#' @param labels.2 vector, label name(s) that represent condition 2
		#' @param X size N-by-p matrix, input merged dataset of interest after dimension reduction.
		#' @param cell.labels size N character vector, labels for each input cell
		#' @param labels.1 character vector, label name(s) that represent condition 1
		#' @param labels.2 character vector, label name(s) that represent condition 2
		#' @param k.vector vector, k values to create the score vector
		#' @param alpha numeric, elasticnet mixing parameter passed to glmnet(), default 0 (Ridge)
		#' @param k.folds integer, number of data splits used in the neural network, default 10
		#' @param n.runs integer, number of times to run the neural network to get the predictions, default 10
		#' @param pred.thres length-2 vector, top and bottom threshold on the predictions from the
		@@ -18,18 +19,17 @@
		#' default True
		#' @param plot.embedding size N-by-2 matrix, 2D embedding for the cells
		#' @param size cell size to use in the plot, default 0.5
		#' @param python.use character string, the Python to use, default "/usr/bin/python"
		# @param source.code character string, the neural network source code, default "./DA_nn.py"
		#' @param GPU which GPU to use, default '', using CPU
		#'
		#' @import reticulate
		#' @import RANN
		#' @importFrom caret createFolds
		#' @import glmnet
		#'
		#' @return a list of results
		#' \describe{
		#' \item{da.ratio}{score vector for each cell}
		#' \item{da.pred}{(mean) prediction from the neural network}
		#' \item{da.cell.id}{index for DA cells}
		#' \item{da.up}{index for DA cells more abundant in condition of labels.2}
		#' \item{da.down}{index for DA cells more abundant in condition of labels.1}
		#' \item{pred.plot}{ggplot object showing the predictions of logistic regression on plot.embedding}
		#' \item{da.cells.plot}{ggplot object highlighting cells of da.cell.idx on plot.embedding}
		#' }
		@@ -37,79 +37,52 @@
		#' @export

		getDAcells <- function(
		X, cell.labels, labels.1, labels.2, k.vector,
		k.folds = 10, n.runs = 10, pred.thres = c(0.05,0.95),
		do.plot = T, plot.embedding = NULL, size = 0.5,
		python.use = "/usr/bin/python", GPU = ""
		X, cell.labels, labels.1, labels.2, k.vector = NULL, save.knn = F,
		alpha = 0, k.folds = 10, n.runs = 10, pred.thres = c(0.05,0.95),
		do.plot = T, plot.embedding = NULL, size = 0.5
		){
		# cat("Using GPU ", GPU, ".\n", sep = "")
		if(!inherits(x = X, what = "matrix")){
		cat("Turning X to a matrix.\n")
		X <- as.matrix(X)
		}

		# check label input
		if(!inherits(cell.labels, "character") \|
		!inherits(labels.1, "character") \| !inherits(labels.2, "character")){
		stop("Input parameters cell.labels, labels.1 and labels.2 must be character")
		}
		if(length(setdiff(cell.labels, c(labels.1, labels.2))) > 0){
		stop("Input parameter cell.labels contain labels not from labels.1 or labels.2")
		}
		n.cells <- length(cell.labels)

		# get DA score vector for each cell
		cat("Calculating DA score vector.\n")
		X.knn.ratio <- daPerCell(
		X.knn.result <- daPerCell(
		X = X,
		cell.labels = cell.labels,
		labels.1 = labels.1,
		labels.2 = labels.2,
		k.vector = k.vector
		)
		X.knn.ratio <- X.knn.result[["ratio"]]
		X.knn.graph <- X.knn.result[["graph"]]


		# prepare data for python script
		use_python(python = python.use, required = T)

		# GLM
		cat("Running GLM.\n")
		binary.labels <- cell.labels
		binary.labels[cell.labels %in% labels.1] <- 0.0
		binary.labels[cell.labels %in% labels.2] <- 1.0

		binary.labels_py <- r_to_py(as.matrix(binary.labels))
		X.knn.ratio_py <- r_to_py(as.matrix(X.knn.ratio))

		# get neural network predictions for each cell
		cat("Running logistic regression.\n")
		source_python(file = paste(system.file(package="DAseq"), "DA_logit.py", sep = "/"))
		# py_run_string(paste("epochs = ", epochs, sep = ""))
		py_run_string(paste("k_folds = ", k.folds, sep = ""))

		# set GPU device
		py_run_string(paste("os.environ['CUDA_VISIBLE_DEVICES'] = '", GPU, "'", sep = ""))


		# check n.runs
		if(n.runs == 1){
		X.pred <- k_fold_predict_linear(X.knn.ratio_py, binary.labels_py, py$k_folds)
		# X.pred <- logi_predict(X.knn.ratio_py, binary.labels_py, py$epochs)
		# if(linear){
		# X.pred <- k_fold_predict_linear(X.knn.ratio_py, binary.labels_py, py$k_folds)
		# } else {
		# X.pred <- k_fold_predict(X.knn.ratio_py, binary.labels_py, py$k_folds)
		# }
		X.pred <- as.numeric(X.pred)
		X.std <- NULL
		} else {
		cat("Running ", n.runs, " runs.\n", sep = "")
		X.pred.all <- list()
		for(ii in 1:n.runs){
		X.pred.all[[ii]] <- as.numeric(k_fold_predict_linear(X.knn.ratio_py, binary.labels_py, py$k_folds))
		# X.pred.all[[ii]] <- as.numeric(logi_predict(X.knn.ratio_py, binary.labels_py, py$epochs))
		# if(linear){
		# X.pred.all[[ii]] <- as.numeric(k_fold_predict_linear(X.knn.ratio_py, binary.labels_py, py$k_folds))
		# } else {
		# X.pred.all[[ii]] <- as.numeric(k_fold_predict(X.knn.ratio_py, binary.labels_py, py$k_folds))
		# }
		}
		X.pred.all <- do.call("rbind", X.pred.all)
		X.pred <- colMeans(X.pred.all)
		X.std <- apply(X.pred.all, 2, sd)
		}

		X.pred <- runDAlasso(
		X = X.knn.ratio, y = factor(binary.labels),
		k.folds = k.folds, n.runs = n.runs, alpha = alpha
		)

		# select DA cells
		pred.thres <- sort(pred.thres, decreasing = F)
		X.da.idx <- which(
		X.pred < quantile(X.pred, pred.thres[1]) \| X.pred > quantile(X.pred, pred.thres[2])
		)

		X.da.up <- which(X.pred > quantile(X.pred, pred.thres[2]))
		X.da.down <- which(X.pred < quantile(X.pred, pred.thres[1]))

		# plot results
		if(do.plot & is.null(plot.embedding)){
		@@ -123,8 +96,7 @@ getDAcells <- function(
		X.da.cells.plot <- plotDAsite(
		X = plot.embedding,
		site.list = list(
		which(X.pred < quantile(X.pred, pred.thres[1])),
		which(X.pred > quantile(X.pred, pred.thres[2]))
		X.da.down, X.da.up
		),
		size = size, cols = c("blue","red")
		)
		@@ -134,42 +106,89 @@ getDAcells <- function(
		}

		# return result
		if(save.knn){
		return(list(
		knn.graph = X.knn.graph,
		da.ratio = X.knn.ratio,
		da.pred = X.pred,
		da.up = X.da.up,
		da.down = X.da.down,
		# da.std = X.std,
		# da.cell.idx = X.da.idx,
		pred.plot = X.pred.plot,
		da.cells.plot = X.da.cells.plot
		))
		} else {
		return(list(
		da.ratio = X.knn.ratio,
		da.pred = X.pred,
		da.up = X.da.up,
		da.down = X.da.down,
		# da.std = X.std,
		da.cell.idx = X.da.idx,
		# da.cell.idx = X.da.idx,
		pred.plot = X.pred.plot,
		da.cells.plot = X.da.cells.plot
		))
		}
		}



		#' Update DA cells
		#'
		#' Use different threshold to select DA cells based on an output from getDAcells()
		#' Use different threshold to select DA cells based on an output from getDAcells().
		#'
		#' @param X output from getDAcells()
		#' @param pred.thres length-2 vector, top and bottom threshold on the predictions from the
		#' logistic classification, default c(0.05,0.95)
		#' @param alpha set this parameter to not NULL to rerun Logistic regression
		#' @param do.plot a logical value to indicate whether to return ggplot objects showing the results,
		#' default True
		#' @param plot.embedding size N-by-2 matrix, 2D embedding for the cells
		#' @param size cell size to use in the plot, default 0.5

		#'
		#' @return a list of results with updated DA cells
		#' @export

		updateDAcells <- function(
		X, pred.thres = c(0.05,0.95), do.plot = T, plot.embedding = NULL, size = 0.5
		X, pred.thres = c(0.05,0.95),
		alpha = NULL, k.folds = 10, n.runs = 10,
		cell.labels = NULL, labels.1 = NULL, labels.2 = NULL,
		do.plot = T, plot.embedding = NULL, size = 0.5
		){
		# select DA cells
		n.cells <- length(X$da.pred)

		if(!is.null(alpha) & (is.null(cell.labels) \| is.null(labels.1) \| is.null(labels.2))){
		stop("To update DA cells with new alpha, please also specify cell.labels, labels.1, labels.2.")
		}
		if(!is.null(alpha) & (!is.null(cell.labels) & !is.null(labels.1) & !is.null(labels.2))){
		# check label input
		if(!inherits(cell.labels, "character") \|
		!inherits(labels.1, "character") \| !inherits(labels.2, "character")){
		stop("Input parameters cell.labels, labels.1 and labels.2 must be character")
		}
		if(length(setdiff(cell.labels, c(labels.1, labels.2))) > 0){
		stop("Input parameter cell.labels contain labels not from labels.1 or labels.2")
		}

		binary.labels <- cell.labels
		binary.labels[cell.labels %in% labels.1] <- 0.0
		binary.labels[cell.labels %in% labels.2] <- 1.0

		X.pred <- runDAlasso(
		X = X$da.ratio, y = factor(binary.labels),
		k.folds = k.folds, n.runs = n.runs, alpha = alpha
		)
		}
		if(is.null(alpha)){
		X.pred <- X$da.pred
		}

		# select DA cells based on new pred.thres
		pred.thres <- sort(pred.thres, decreasing = F)
		X.da.idx <- which(
		X.pred < quantile(X.pred, pred.thres[1]) \| X.pred > quantile(X.pred, pred.thres[2])
		)
		X.da.up <- which(X.pred > quantile(X.pred, pred.thres[2]))
		X.da.down <- which(X.pred < quantile(X.pred, pred.thres[1]))

		# plot results
		if(do.plot & is.null(plot.embedding)){
		@@ -183,8 +202,7 @@ updateDAcells <- function(
		X.da.cells.plot <- plotDAsite(
		X = plot.embedding,
		site.list = list(
		which(X.pred < quantile(X.pred, pred.thres[1])),
		which(X.pred > quantile(X.pred, pred.thres[2]))
		X.da.down, X.da.up
		),
		size = size, cols = c("blue","red")
		)
		@@ -194,13 +212,12 @@ updateDAcells <- function(
		}

		# return result
		return(list(
		da.ratio = X$da.ratio,
		da.pred = X.pred,
		da.cell.idx = X.da.idx,
		pred.plot = X.pred.plot,
		da.cells.plot = X.da.cells.plot
		))
		X$da.pred <- X.pred
		X$da.up <- X.da.up
		X$da.down <- X.da.down
		X$pred.plot <- X.pred.plot
		X$da.cells.plot <- X.da.cells.plot
		return(X)
		}


		@@ -213,29 +230,53 @@ daPerCell <- function(
		knn.graph <- knn.out$nn.idx

		n.cells <- length(cell.labels)
		knn.diff.ratio <- matrix(0, nrow = n.cells, ncol = length(k.vector))
		colnames(knn.diff.ratio) <- as.character(k.vector)
		n.k <- length(k.vector)
		# knn.diff.ratio <- matrix(0, nrow = n.cells, ncol = length(k.vector))

		labels.1 <- labels.1[labels.1 %in% cell.labels]
		labels.2 <- labels.2[labels.2 %in% cell.labels]

		cell.label.name <- sort(unique(cell.labels))
		cell.label.tab <- table(factor(cell.labels, levels = cell.label.name))
		cell.labels.bin <- cell.labels
		cell.labels.bin[cell.labels %in% labels.1] <- 0
		cell.labels.bin[cell.labels %in% labels.2] <- 1
		cell.labels.bin <- as.numeric(cell.labels.bin)
		n.label.2 <- sum(cell.labels.bin)
		n.label.1 <- n.cells - n.label.2

		# count labels
		for(ii in 1:n.cells){
		for(kk in k.vector){
		i.kk.label <- cell.labels[knn.graph[ii,1:kk]]
		i.kk.ratio1 <- sum(i.kk.label %in% labels.1) / sum(cell.labels %in% labels.1)
		i.kk.ratio2 <- sum(i.kk.label %in% labels.2) / sum(cell.labels %in% labels.2)
		knn.diff.ratio[ii,as.character(kk)] <- (i.kk.ratio2 - i.kk.ratio1) / (i.kk.ratio2 + i.kk.ratio1)
		# i.kk.label.ratio <- table(factor(i.kk.label, levels = cell.label.name)) / cell.label.tab
		# knn.diff.ratio[ii,as.character(kk)] <-
		# (mean(i.kk.label.ratio[labels.2]) - mean(i.kk.label.ratio[labels.1])) /
		# sum(i.kk.label.ratio)
		# (mean(i.kk.label.ratio[labels.2]) + mean(i.kk.label.ratio[labels.1]))
		}
		knn.diff.ratio <- matrix(unlist(lapply(seq_len(n.cells), function(ii) lapply(k.vector, function(kk){
		i.kk.label <- cell.labels.bin[knn.graph[ii,1:kk]]
		i.kk.ratio1 <- (kk - sum(i.kk.label)) / n.label.1
		i.kk.ratio2 <- sum(i.kk.label) / n.label.2
		return((i.kk.ratio2 - i.kk.ratio1) / (i.kk.ratio2 + i.kk.ratio1))
		}))), nrow = n.cells, byrow = T)
		colnames(knn.diff.ratio) <- as.character(k.vector)

		return(list(
		"graph" = knn.graph,
		"ratio" = knn.diff.ratio
		))
		}

		return(knn.diff.ratio)

		# LASSO regression with CV and multiple runs
		runDAlasso <- function(X, y, k.folds = 10, n.runs = 10, alpha = 0){
		#X.data <- data.frame(X, response = as.factor(y))
		n.obs <- length(y)
		X.pred.all <- list()
		for(ii in 1:n.runs){
		set.seed(ii)
		X.pred.all[[ii]] <- rep(0, n.obs)
		X.folds <- createFolds(y = y, k = k.folds)
		for(jj in 1:k.folds){
		X.glm <- cv.glmnet(x = X[-X.folds[[jj]],], y = y[-X.folds[[jj]]], family = "binomial", alpha = alpha)
		X.glm.pred <- predict(
		object = X.glm, newx = X[X.folds[[jj]],], s = "lambda.1se", type = "response"
		)
		X.pred.all[[ii]][X.folds[[jj]]] <- X.glm.pred
		}
		}
		X.pred.all <- do.call("rbind", X.pred.all)
		X.pred <- colMeans(X.pred.all)
		return(X.pred)
		}

Admin message