More documentation and finalize first pipeline steps. (17925e4a) · Commits · github_fork / DTUrtle

DESCRIPTION

+2 −2

Original line number	Diff line number	Diff line
		@@ -9,7 +9,7 @@ Description: More about what it does (maybe more than one line)
		License: What license is it under?
		Encoding: UTF-8
		LazyData: true
		Depends: tximport, DRIMSeq, stageR, GenomicFeatures, assertthat
		Imports: matrixStats
		Depends: R (>= 3.3.0)
		Imports: tximport, DRIMSeq, stageR, matrixStats, GenomicFeatures, formattable, DT, Gviz, assertthat, Matrix
		RoxygenNote: 7.1.0
		Roxygen: list(markdown = TRUE)

NAMESPACE

+4 −0

Original line number	Diff line number	Diff line
		@@ -4,5 +4,9 @@ import('GenomicFeatures')
		import('tximport')
		import('DRIMSeq')
		import('stageR')
		import('formattable')
		import('DT')
		import('Gviz')
		import('assertthat')
		importFrom('matrixStats', 'rowVars')
		importFrom('Matrix', 'summary')

R/dtu_analysis.R

+150 −68

Original line number	Diff line number	Diff line


		#TODO: add bulk option
		#' Title
		#TODO: Refer to combination of sc data matrix
		#' Import the quantification results of many RNAseq quantifiers, including 'alevin' for single-cell data.
		#'
		#' @param path
		#' @param ...
		#' Most likely the first step in your DTUrtle analysis.
		#' Can perform multiple scaling schemes, defaults to scaling schemes appropriate for DTU analysis.
		#'
		#' @return
		#' @param files Vector of files to be imported. Optionally can be named to keep the samples names.
		#' @param type Type of the quantification data. All tools supported by tximport can be selected. If you have single-cell data, the use of `alevin` is proposed.
		#' - `'salmon'`
		#' - `'alevin'`
		#' - `'kallisto'`
		#' - `'rsem'`
		#' - `'stringtie'`
		#' - `'sailfish'`
		#' - `'none'`
		#' @param ... Further parameters to the specific tximport call. See `tximport()` for available parameters.
		#'
		#' @return - For bulk data: A combined count matrix for all specified samples.
		#' - For single-cell data (`type='alevin'`): A list of count matrices per sample. Can be combined and optionally added to a Seurat object with TODO.
		#' @export
		#'
		#' @examples
		readin_alevin <- function(path, ...){
		message("Read in ", length(path), " alevin runs.")
		import_counts <- function(files, type, ...){
		assert_that(type %in% c("salmon", "alevin", "kallisto", "rsem", "stringtie", "sailfish", "none"))
		message("Reading in ", length(files), " ", type, " runs.")

		args=list(...)

		if(type=="alevin"){
		return_obj = list()
		for(i in path){
		browser()
		#temp <- tximport:::readAlevin(files=i, dropInfReps=F, forceSlow=F, ...)
		return_obj <- append(return_obj, tximport(files = i, type = "alevin", ...))

		if(hasArg("countsFromAbundance")){
		warning("\nImport of alevin files currently does not support using scaling methods.\nPlease note, that in tagged-end single-cell protocols (like 10X chromium) it is assumed\nthat there is no length effect in the fragment generation process - thus making a scaling unnecessary.")
		}

		if(!is.null(names(path))){
		names(return_obj) <- names(path)
		for(i in files){
		return_obj <- append(return_obj, tximport(files = i, type = "alevin", ...)$counts)
		}
		if(!is.null(names(files))){
		names(return_obj) <- names(files)
		}
		return(return_obj)
		}else{

		if(hasArg("countsFromAbundance")){
		if(!args$countsFromAbundance %in% c("dtuScaledTPM", "scaledTPM")){
		warning("It is recommended to use the 'countsFromAbundance' scaling schemes 'dtuScaledTPM' or 'scaledTPM',\nto correct for increased counts of longer transcripts.\nIf you are using a tagged-end protocol, the use 'no' is suggested.")
		}
		}else{
		if(hasArg("tx2gene")){
		message("Using 'dtuScaledTPM' for 'countsFromAbundance'.")
		args$countsFromAbundance <- "dtuScaledTPM"
		}else{
		message("Using 'scaledTPM' for 'countsFromAbundance'. If you specify a 'tx2gene' file, 'dtuScaledTPM' can be used.")
		args$countsFromAbundance <- "scaledTPM"
		}
		}
		if(hasArg(txOut)){
		if(args$txOut==F){
		warning("Unless you exactly know what you are doing it is not recommended to set txOut to FALSE.\nDownstream analysis may fail!")
		}
		}else{
		args$txOut <- T
		}
		args$files <- files
		args$type <- type
		return(do.call(tximport, args)$counts)
		}
		}


		#TODO: Check if seurat_obj is seurat object!
		#' Title
		#TODO: combine to matrix without Seurat! (same extension scheme)
		#' Combine list of sparse transcription count matrices.
		#'
		#' @param tx_mat
		#' @param seurat_obj
		#' Only needed when dealing with single-cell data. Adds a cellname extension if necessary.
		#' Can optionally add the combined matrix to a existing Seurat object (keeping the cellname extension of the object and matching the cells).
		#'
		#' @return
		#' @param tx_list List of sparse transcription count matrices, as returned by `import_counts()` for single-cell data.
		#' @param cell_extensions Optional list of cellname extensions that are added to the cellnames of one sample. The cellnames and the extension are separated by an underscore '_'.
		#' @param seurat_obj Optional seurat object, where the combined matrix is added as an assay. This has the advantage, that the cells are matched and subsetted if necessary. Currently only Seurat 3 objects are supported.
		#' @param assay_name If the combined matrix should be added to an existing Seurat object, the name of the assay can be specified here.
		#'
		#' @return Either a combined sparse transcription count matrix or a seurat object which the combined sparse transcription count matrix as an assay.
		#' @export
		#'
		#' @examples
		add_to_seurat <- function(tx_mat, seurat_obj){
		if(!is.list(tx_mat)){
		tx_mat <- list(tx_mat)
		combine_to_matrix <- function(tx_list, cell_extensions=NULL, seurat_obj=NULL, assay_name="dtutx"){

		if(!is.null(seurat_obj)){
		assert_that(require("Seurat", character.only = T), msg = "The package Seurat is needed for adding the combined matrix to a seurat object!")
		assert_that(packageVersion("Seurat")>="3.0.0", msg = "At least Version 3 of Seurat is needed. Currently only Seurat 3 objects are supported!")
		#TODO: check if seurat_obj is seurat object!
		assert_that(is.string(assay_name))
		}

		if(!is.list(tx_list)){
		tx_list <- list(tx_list)
		}

		if(is.null(names(tx_mat))\|length(unique(names(tx_mat)))!=length(tx_mat)){
		names(tx_mat) <- paste0("obj_",seq_along(tx_mat))
		if(!all(as.logical(lapply(tx_list, FUN = function(x) is(x, 'sparseMatrix'))))){
		stop("Your 'tx_list' object contains non sparseMatrix elements.")
		}

		if(!all(as.logical(lapply(tx_mat, FUN = function(x) is(x, 'sparseMatrix'))))){
		stop("Your tx_mat object contains non sparseMatrix elements.")
		if(is.null(names(tx_list))\|length(unique(names(tx_list)))!=length(tx_list)){
		names(tx_list) <- paste0("obj_",seq_along(tx_list))
		}

		#map cell extensions
		if(!is.null(cell_extensions)){
		assert_that(are_equal(length(cell_extensions), length(tx_list)), msg="cell_extensions must have same length as tx_list!")
		}

		dup <- any(duplicated(unlist(lapply(tx_list, FUN = colnames))))

		if(!is.null(seurat_obj)){
		extensions_in_seur <- sum(grepl(pattern = "_", x = Cells(seurat_obj), fixed = T))>length(Cells(seurat_obj))*0.1
		if(!extensions_in_seur&dup){
		stop("Found no Seurat cellname extension but not unique cellnames. Make your cellnames unique or extend the cellnames in your Seurat object.")
		}
		if(extensions_in_seur){
		message("At least some seurat cellnames show a cellname extension.")
		extension_list <- paste0("_", unique(stringi::stri_split_fixed(str= Cells(seurat_obj), pattern = "_", n = 2, simplify = T)[,2]))
		extension_mapping <- names(tx_mat)
		if(length(extension_list)!=length(extension_mapping)){
		stop(paste0("Could not 1:1 map seurat cellname extensions and tx file list.\n",
		"Try subsetting the seurat object if you do not want to provide tx information of all cells."))
		cell_extensions <- paste0("_", unique(stringi::stri_split_fixed(str= Cells(seurat_obj), pattern = "_", n = 2, simplify = T)[,2]))
		if(length(cell_extensions)!=length(names(tx_list))){
		stop("Could not 1:1 map seurat cellname extensions and tx file list.\n",
		"Try subsetting the seurat object if you do not want to provide tx information for all samples.")
		}
		}
		}
		message("Map extensions:\n\t", paste0(extension_mapping, " --> '", extension_list, "'\n\t"))

		for(i in seq_along(tx_mat)){
		colnames(tx_mat[[i]]) <- paste0(colnames(tx_mat[[i]]), extension_list[[i]])

		if(dup\|!is.null(cell_extensions)){
		if(dup){
		message("Found overall duplicated cellnames. Trying cellname extension per sample.")
		}

		if(is.null(cell_extensions)){
		cell_extensions <- paste0("_", seq_along(tx_list))
		}

		cell_extensions <- paste0(ifelse(startsWith(cell_extensions, "_"), "", "_"), cell_extensions)
		message("Map extensions:\n\t", paste0(names(tx_list), " --> '", cell_extensions, "'\n\t"))

		for(i in seq_along(tx_list)){
		colnames(tx_list[[i]]) <- paste0(colnames(tx_list[[i]]), cell_extensions[[i]])
		}

		all_cells <- unlist(lapply(tx_mat, FUN = colnames))

		all_cells <- unlist(lapply(tx_list, FUN = colnames))
		dup_cells <- all_cells[duplicated(all_cells)]

		if(length(dup_cells)>0){
		stopstr <- ifelse(length(dup_cells)>10, paste0("Found ", length(dup_cells), " duplicated cellnames!"), paste0("Found duplicated cellnames:\n\t", paste0(dup_cells, collapse = "\n\t")))
		stopstr <- ifelse(length(dup_cells)>10, paste0("Found ", length(dup_cells), " duplicated cellnames even after cellname extension!"), paste0("Found duplicated cellnames even after cellname extension:\n\t", paste0(dup_cells, collapse = "\n\t")))
		stop(stopstr)
		}
		message("Merging matrices")
		tx_mat <- merge.sparse(tx_mat)

		common_cells <- intersect(Cells(seurat_obj), colnames(tx_mat))
		uniq_cells <- setdiff(colnames(tx_mat), Cells(seurat_obj))
		extra_seurat_cells <- setdiff(Cells(seurat_obj), colnames(tx_mat))
		message("Of ", ncol(tx_mat), " cells, ", length(common_cells), " (", round(length(common_cells)/ncol(tx_mat)*100), "%) could be found in the provided seurat object.")
		message(length(uniq_cells), " (", round(length(uniq_cells)/ncol(tx_mat)*100),"%) are unique to the tx files.")
		}

		message("Merging matrices")
		tx_list <- merge_sparse(tx_list)

		if(!is.null(seurat_obj)){
		common_cells <- intersect(Cells(seurat_obj), colnames(tx_list))
		uniq_cells <- setdiff(colnames(tx_list), Cells(seurat_obj))
		extra_seurat_cells <- setdiff(Cells(seurat_obj), colnames(tx_list))
		message("Of ", ncol(tx_list), " cells, ", length(common_cells), " (", round(length(common_cells)/ncol(tx_list)*100), "%) could be found in the provided seurat object.")
		message(length(uniq_cells), " (", round(length(uniq_cells)/ncol(tx_list)*100),"%) are unique to the tx files.")
		message("The seurat object contains ", length(extra_seurat_cells), " additional cells.")
		if(length(uniq_cells)>0\|length(extra_seurat_cells)>0){
		message("Subsetting!")
		}

		seurat_obj <- subset(seurat_obj, cells=common_cells)
		tx_mat <- tx_mat[,common_cells]

		message("Adding assay 'dtutx'")
		seurat_obj[["dtutx"]] <- CreateAssayObject(counts=tx_mat)
		tx_list <- tx_list[,common_cells]

		message("Adding assay '",assay_name,"'")
		seurat_obj[[assay_name]] <- CreateAssayObject(counts=tx_list)
		return(seurat_obj)
		}




		else{
		return(tx_list)
		}
		}


		#' Compute the main DRIMSeq results
		@@ -119,11 +201,11 @@ add_to_seurat <- function(tx_mat, seurat_obj){
		#' @param n_core Set the number of CPU cores that should be used in the statistical computations.
		#' @param ... If `filtering_strategy=='own'` specify the wished parameters of `dmFilter()` here.
		#'
		#' @return `DRIM` object with the key results, that can be used in the DTUrtle steps hereafter. The object is just a easily accesible list with the following items:
		#' @return `drim` object with the key results, that can be used in the DTUrtle steps hereafter. The object is just a easily accesible list with the following items:
		#' - `drim`: The results of the DRIMSeq statistical computations (`dmTest()`).
		#' - `design_full`: The design matrix generated from the specified `pd` columns
		#' - `design_full`: The design matrix generated from the specified `pd` columns.
		#' - `cond_levels`: A copy of the specified comparison groups.
		#' - `group`: ?
		#' - `group`: Vector which sample/cell belongs to which comparison group.
		#' - `pct_exp_tx`: Data frame of the expressed-in percentage of all transcripts. [TODO: split by group?]
		#' - `pct_exp_gene`: Data frame of the expressed-in percentage of all genes. [TODO: split by group?]
		#'

R/helper_functions.R

+18 −5

Original line number	Diff line number	Diff line
		@@ -45,17 +45,16 @@ merge_sparse <- function(tx_mat) {
		i <- numeric()
		j <- numeric()

		for (M in tx_mat) {

		cnold <- colnames(M)
		rnold <- rownames(M)
		for (mat in tx_mat) {

		cnold <- colnames(mat)
		rnold <- rownames(mat)
		cnnew <- union(cnnew,cnold)
		rnnew <- union(rnnew,rnold)

		cindnew <- match(cnold,cnnew)
		rindnew <- match(rnold,rnnew)
		ind <- summary(M)
		ind <- Matrix::summary(mat)
		i <- c(i,rindnew[ind[,1]])
		j <- c(j,cindnew[ind[,2]])
		x <- c(x,ind[,3])
		@@ -124,3 +123,17 @@ add_max_delta <- function(dtu_table, dtu){
		dtu_table[[paste0("max(",levels(dtu$group)[1], "-",levels(dtu$group)[2],")")]] <- as.numeric(sapply(dtu_table$geneID, FUN = getmax))
		return(dtu_table)
		}



		#' Title
		#'
		#' @param x
		#'
		#' @return
		#' @export
		#'
		#' @examples
		rm_tx_version <- function(x){
		return(sub("\\..*", "", x))
		}

R/visualization.R

+565 −0

File changed.

Preview size limit exceeded, changes collapsed.

Admin message