update to isoseq tutorial (287f46a1) · Commits · github_fork / HdWGCNA

R/FindMajorIsoforms.R

+383 −16

Original line number	Diff line number	Diff line

		#' FindMajorIsoforms
		#'
		#' Finds the set of "major isoforms" for each gene in each cell population. The set of
		#' major isoforms consists of the isoforms accounting for a desired proportion of a gene's
		#' expression. Pseudobulk replicates in each sample and cell population are used for this calculation.
		#'
		#' @return a list of major isoforms for each cell population
		#'
		#' @param seurat_obj A Seurat object
		#' @param group.by column in seurat_obj@meta.data containing grouping info, ie clusters or celltypes
		#' @param replicate_col column in seurat_obj@meta.data denoting each replicate / sample
		#' @param isoform_delim
		#' @param proportion_thresh desired proportion of expression to define the set of major isoforms. Default = 0.8.
		#' @param low_thresh lower bound of expression level for considering an isoform as part of the major isoform set.
		#' @param assay Assay in seurat_obj containing isoform expression information.
		#' @param slot Slot in seurat_obj, default to counts slot.
		#' @param cluster_markers Cell population marker gene table from Seurat FindAllMarkers for the same cell populations specified in group.by. Optional parameter, will exclude isoforms that are not from marker genes.
		#' @param wgcna_name The name of the hdWGCNA experiment in the seurat_obj@misc slot
		#' @details
		#' FindMajorIsoforms computes the set of major isoforms in a given Seurat object that contains isoform-level
		#' expression information. First, pseudobulk replicates are computed for the given cell populations and samples
		#' present in the Seurat object. For each gene in each cell population, we rank the gene's isoforms by expression
		#' level and take the top expressing isoforms that make up the desired proportion of the gene's total expression,
		#' making sure to exclude any very lowly expressed isoforms.
		#'
		#' Optionally, the user may supply a marker gene table for each cell population (formatted like the output of Seurat
		#' FindAllMarkers), and then the algorithm will only return major isoforms of the given marker genes.
		#' @import Seurat
		#' @import Matrix
		#' @export
		FindMajorIsoforms <- function(
		seurat_obj,
		group.by,
		@@ -8,6 +37,7 @@ FindMajorIsoforms <- function(
		low_thresh = 25,
		assay = 'iso',
		slot = 'counts',
		cluster_markers = NULL,
		wgcna_name = NULL
		){

		@@ -42,16 +72,26 @@ FindMajorIsoforms <- function(
		stop('DefaultAssay(seurat_obj) is not ', assay, ' please switch the default assay to the desired assay before running this function.')
		}

		# TODO: add checks for this step?
		# check the marker gene table:
		if(!is.null(cluster_markers)){
		# check columns
		if(!all(c("p_val", 'avg_log2FC', 'pct.1', 'pct.2', 'p_val_adj', 'cluster', 'gene') %in% colnames(cluster_markers))){
		stop('Invalid format for cluster_markers table. Expecting a table from Seurat FindAllMarkers, must have the following columns present in the table: p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster, gene.')
		}

		# check the clusters:
		if(!all(as.character(unique(seurat_obj@meta.data[,group.by])) %in% as.character(unique(cluster_markers$cluster)))){
		stop('Groups in group.by must also be present in cluster_markers$cluster')
		}
		}

		# create dataframe to map isoforms to gene names
		if(is.null(iso_df)){
		iso_names <- rownames(seurat_obj)
		gene_names <- do.call(rbind, strsplit(iso_names, iso_delim))[,1]
		gene_names <- do.call(rbind, strsplit(iso_names, isoform_delim))[,1]
		iso_df <- data.frame(
		iso = iso_names,
		gene = gene_names
		)
		}
		if(max(table(iso_df$iso)) > 1){
		stop('Each isoform must only appear once in iso_df. There is likely an issue with the isoform_delim.')
		}
		@@ -60,20 +100,27 @@ FindMajorIsoforms <- function(
		X <- Seurat::GetAssayData(seurat_obj, assay=assay, slot=slot)

		# get pseudobulk replicates
		pseudobulks <- Libra::to_pseudobulk(
		message("Constructing pseudobulks...")
		pseudobulks <- to_pseudobulk(
		X, meta = seurat_obj@meta.data,
		cell_type_col = group.by,
		replicate_col = replicate_col,
		label_col = replicate_col,
		min_reps=0
		)
		message("Done!")

		# initialize progress bar
		message("Finding major isoforms...")
		pb <- utils::txtProgressBar(min = 0, max = length(pseudobulks) ,style = 3, width = 50, char = "=")
		pb_counter <- 1

		# loop through each group to find the major isoforms
		major_list <- list()
		for(cur_group in names(pseudobulks)){
		print(cur_group)
		#print(cur_group)
		cur_pb <- pseudobulks[[cur_group]]
		cur_pb_genes <- do.call(rbind, strsplit(rownames(cur_pb), iso_delim))[,1]
		cur_pb_genes <- do.call(rbind, strsplit(rownames(cur_pb), isoform_delim))[,1]

		# loop through each gene:
		major_isos <- sapply(unique(iso_df$gene), function(cur_gene){
		@@ -82,8 +129,11 @@ FindMajorIsoforms <- function(
		return(cur_iso_df$iso)
		}

		# get the pseudobulk expression for all the isoforms in this gene
		cur_iso_pb <- cur_pb[cur_pb_genes == cur_gene,]
		cur_iso_sum <- rowSums(cur_iso_pb)

		# compute the proportion of the gene's total expression in each isoform
		cur_iso_prop <- cur_iso_sum / sum(cur_iso_sum)

		# don't consider isoforms with very few counts
		@@ -108,8 +158,325 @@ FindMajorIsoforms <- function(
		})
		major_list[[cur_group]] <- as.character(unlist(major_isos))

		# update progress bar
		setTxtProgressBar(pb, pb_counter)
		pb_counter <- pb_counter+1

		}

		# close progress bar
		close(pb)
		message("Done!")

		# return here if the cluster marker table isn't specified
		if(is.null(cluster_markers)){
		return(major_list)
		}

		# intersect with the markers table if it's provided:
		major_marker_list <- lapply(names(major_list), function(cur_group){
		cur_isos <- major_list[[cur_group]]
		cur_genes <- unique(do.call(rbind, strsplit(cur_isos, '[.]'))[,1])
		cur_genes <- subset(cluster_markers, cluster == cur_group & gene %in% cur_genes) %>% .$gene
		subset(iso_df, gene %in% cur_genes & iso %in% cur_isos) %>% .$iso
		})

		# return major isoforms intersected with the marker gene table
		list('major' = major_list, 'marker' = major_marker_list)

		}




		#' Create a pseudobulk matrix
		#'
		#' Convert a single-cell expression matrix (i.e., genes by cells)
		#' to a pseudobulk matrix by summarizing counts within biological replicates.
		#' This function is
		#'
		#' @param input a single-cell matrix to be converted, with features (genes) in rows
		#' and cells in columns. Alternatively, a \code{Seurat}, \code{monocole3}, or
		#' or \code{SingleCellExperiment} object can be directly input.
		#' @param meta the accompanying meta data whereby the rownames match the column
		#' names of \code{input}.
		#' @param replicate_col the vector in \code{meta} containing the replicate
		#' information. Defaults to \code{replicate}.
		#' @param cell_type_col the vector in \code{meta} containing the cell type
		#' information. Defaults to \code{cell_type}.
		#' @param label_col the vector in \code{meta} containing the experimental
		#' label. Defaults to \code{label}.
		#' @param min_cells the minimum number of cells in a cell type to retain it.
		#' Defaults to \code{3}.
		#' @param min_reps the minimum number of replicates in a cell type to retain it.
		#' Defaults to \code{2}.
		#' @param min_features the minimum number of expressing cells (or replicates)
		#' for a gene to retain it. Defaults to \code{0}.
		#' @return a list of pseudobulk matrices, for each cell type.
		#'
		#' @importFrom magrittr %<>% extract
		#' @importFrom dplyr %>% rename_ count group_by filter pull n_distinct distinct
		#' summarise
		#' @importFrom purrr map map_int
		#' @importFrom Matrix rowSums colSums
		#' @importFrom stats setNames
		to_pseudobulk = function(input,
		meta = NULL,
		replicate_col = 'replicate',
		cell_type_col = 'cell_type',
		label_col = 'label',
		min_cells = 3,
		min_reps = 2,
		min_features = 0,
		external = T) {
		if (external) {
		# first, make sure inputs are correct
		inputs = Libra:::check_inputs(
		input,
		meta = meta,
		replicate_col = replicate_col,
		cell_type_col = cell_type_col,
		label_col = label_col)
		expr = inputs$expr
		meta = inputs$meta
		} else {
		expr = input
		}

		# convert to characters
		meta %<>% mutate(replicate = as.character(replicate),
		cell_type = as.character(cell_type),
		label = as.character(label))

		# keep only cell types with enough cells
		keep = meta %>%
		dplyr::count(cell_type, label) %>%
		group_by(cell_type) %>%
		filter(all(n >= min_cells)) %>%
		pull(cell_type) %>%
		unique()

		# process data into gene x replicate x cell_type matrices
		pseudobulks = keep %>%
		map( ~ {
		print(.)
		cell_type = .
		meta0 = meta %>% filter(cell_type == !!cell_type)
		expr0 = expr %>% magrittr::extract(, meta0$cell_barcode)
		# catch cell types without replicates or conditions
		if (n_distinct(meta0$label) < 2)
		return(NA)
		replicate_counts = distinct(meta0, label, replicate) %>%
		group_by(label) %>%
		summarise(replicates = n_distinct(replicate)) %>%
		pull(replicates)
		if (any(replicate_counts < min_reps))
		return(NA)

		# process data into gene X replicate X cell_type matrice
		mm = model.matrix(~ 0 + replicate:label, data = meta0)
		mat_mm = expr0 %*% mm
		keep_genes = rowSums(mat_mm > 0) >= min_features
		mat_mm = mat_mm[keep_genes, ] %>% as.data.frame()
		mat_mm %<>% as.data.frame()
		colnames(mat_mm) = gsub("replicate\|label", "", colnames(mat_mm))
		# drop empty columns
		keep_samples = colSums(mat_mm) > 0
		mat_mm %<>% magrittr::extract(, keep_samples)
		return(mat_mm)
		}) %>%
		setNames(keep)

		# drop NAs
		pseudobulks %<>% magrittr::extract(!is.na(.))

		# also filter out cell types with no retained genes
		min_dim = map(pseudobulks, as.data.frame) %>% map(nrow)
		pseudobulks %<>% magrittr::extract(min_dim > 1)

		# also filter out types without replicates
		min_repl = map_int(pseudobulks, ~ {
		# make sure we have a data frame a not a vector
		tmp = as.data.frame(.)
		targets = data.frame(group_sample = colnames(tmp)) %>%
		mutate(group = gsub(".*\\:", "", group_sample))
		if (n_distinct(targets$group) == 1)
		return(as.integer(0))
		min(table(targets$group))
		})
		pseudobulks %<>% magrittr::extract(min_repl >= min_reps)
		return(pseudobulks)
		}

		major_list
		#' Check inputs
		#'
		#' Check inputs prior to running to_pseudobulk
		#'
		#'
		#' @param input a single-cell matrix to be converted, with features (genes) in rows
		#' and cells in columns. Alternatively, a \code{Seurat}, \code{monocole3}, or
		#' or \code{SingleCellExperiment} object can be directly input.
		#' @param meta the accompanying meta data whereby the rownames match the column
		#' names of \code{input}.
		#' @param replicate_col the vector in \code{meta} containing the replicate
		#' information. Defaults to \code{replicate}.
		#' @param cell_type_col the vector in \code{meta} containing the cell type
		#' information. Defaults to \code{cell_type}.
		#' @param label_col the vector in \code{meta} containing the experimental
		#' label. Defaults to \code{label}.
		#' @param min_cells the minimum number of cells in a cell type to retain it.
		#' Defaults to \code{3}.
		#' @param min_reps the minimum number of replicates in a cell type to retain it.
		#' Defaults to \code{2}.
		#' @param min_features the minimum number of expressing cells (or replicates)
		#' for a gene to retain it. Defaults to \code{0}.
		#' @return a cleaned up expression matrix and meta data object
		#'
		#' @importFrom dplyr %>% rename_ n_distinct mutate_at vars
		#' @importFrom magrittr %<>%
		#' @importFrom tester is_numeric_matrix is_numeric_dataframe
		#' @importFrom methods is
		#'
		check_inputs = function(input,
		meta = meta,
		replicate_col = 'replicate',
		cell_type_col = 'cell_type',
		label_col = 'label') {

		# extract cell types and label from metadata
		if ("Seurat" %in% class(input)) {
		# confirm Seurat is installed
		if (!requireNamespace("Seurat", quietly = TRUE)) {
		stop("install \"Seurat\" R package for Augur compatibility with ",
		"input Seurat object", call. = FALSE)
		}
		meta = input@meta.data %>%
		droplevels()
		if (!is.null(replicate_col))
		replicates = as.character(meta[[replicate_col]])
		if (!is.factor(meta[[label_col]])) {
		labels = meta[[label_col]]
		} else {
		labels = as.character(meta[[label_col]])
		}
		cell_types = as.character(meta[[cell_type_col]])
		expr = Seurat::GetAssayData(input, slot = 'counts')
		} else if ("cell_data_set" %in% class(input)) {
		# confirm monocle3 is installed
		if (!requireNamespace("monocle3", quietly = TRUE)) {
		stop("install \"monocle3\" R package for Augur compatibility with ",
		"input monocle3 object", call. = FALSE)
		}
		meta = monocle3::pData(input) %>%
		droplevels() %>%
		as.data.frame()
		if (!is.null(replicate_col))
		replicates = as.character(meta[[replicate_col]])
		if (!is.factor(meta[[label_col]])) {
		labels = meta[[label_col]]
		} else {
		labels = as.character(meta[[label_col]])
		}
		cell_types = as.character(meta[[cell_type_col]])
		expr = monocle3::exprs(input)
		} else if ("SingleCellExperiment" %in% class(input)){
		# confirm SingleCellExperiment is installed
		if (!requireNamespace("SingleCellExperiment", quietly = TRUE)) {
		stop("install \"SingleCellExperiment\" R package for Augur ",
		"compatibility with input SingleCellExperiment object",
		call. = FALSE)
		}
		meta = SummarizedExperiment::colData(input) %>%
		droplevels() %>%
		as.data.frame()
		if (!is.null(replicate_col))
		replicates = as.character(meta[[replicate_col]])
		if (!is.factor(meta[[label_col]])) {
		labels = meta[[label_col]]
		} else {
		labels = as.character(meta[[label_col]])
		}
		cell_types = as.character(meta[[cell_type_col]])
		expr = SummarizedExperiment::assay(input)
		} else {
		# check if input is sparse matrix or numberic matrix/df
		valid_input = is(input, 'sparseMatrix') \|\|
		is_numeric_matrix(input) \|\|
		is_numeric_dataframe(input)
		if (!valid_input)
		stop("input must be Seurat, monocle, sparse matrix, numeric matrix, or ",
		"numeric data frame")
		if (is.null(meta))
		stop("input matrix must be accompanied by a metadata table")
		expr = input
		if (!is.null(replicate_col))
		replicates = as.character(meta[[replicate_col]])
		labels = as.character(meta[[label_col]])
		cell_types = as.character(meta[[cell_type_col]])
		}

		# check dimensions are non-zero
		if (length(dim(expr)) != 2 \|\| !all(dim(expr) > 0)) {
		stop("expression matrix has at least one dimension of size zero")
		}

		# check dimensions match
		n_cells1 = nrow(meta)
		n_cells2 = ncol(expr)
		if (n_cells1 != n_cells2) {
		stop("number of cells in metadata (", n_cells1, ") does not match number ",
		"of cells in expression (", n_cells2, ")")
		}

		# check at least two labels
		if (n_distinct(labels) == 1) {
		stop("only one label provided: ", unique(labels))
		}

		# check for missing labels or cell types
		if (any(is.na(labels))) {
		stop("labels contain ", sum(is.na(labels)), "missing values")
		}
		if (any(is.na(cell_types))) {
		stop("cell types contain ", sum(is.na(cell_types)), "missing values")
		}
		if (!is.null(replicate_col) && any(is.na(replicates))) {
		stop("replicates contain ", sum(is.na(replicates)), "missing values")
		}

		# check for missing replicates
		if (!is.null(replicate_col) && is.null(replicates)) {
		stop("metadata does not contain replicate information")
		}

		# remove missing values
		missing = is.na(expr)
		if (any(missing)) {
		stop("matrix contains ", sum(missing), "missing values")
		}

		# clean up the meta data
		if (!is.null(replicate_col)) {
		meta %<>% as.data.frame() %>%
		mutate(cell_barcode = rownames(meta),
		replicate = meta[[replicate_col]],
		cell_type = meta[[cell_type_col]],
		label = meta[[label_col]]) %>%
		mutate_at(vars(replicate, cell_type, label), as.factor)
		} else {
		meta %<>% as.data.frame() %>%
		mutate(cell_barcode = rownames(meta),
		cell_type = meta[[cell_type_col]],
		label = meta[[label_col]]) %>%
		mutate_at(vars(cell_type, label), as.factor)
		}

		# make sure meta contains row names and is a data frame
		rownames(meta) = colnames(expr)
		meta = as.data.frame(meta)
		to_return = list(
		expr = expr,
		meta = meta
		)
		return(to_return)
		}
		No newline at end of file

Admin message