Runtime optimized sparse filtering. (813de833) · Commits · github_fork / DTUrtle

DESCRIPTION

+3 −3

Original line number	Diff line number	Diff line
		@@ -6,10 +6,10 @@ Author: Tobias Tekath
		Maintainer: The package maintainer <yourself@somewhere.net>
		Description: More about what it does (maybe more than one line)
		Use four spaces when indenting paragraphs within the Description.
		License: What license is it under?
		License: GPL (>= 3)
		Encoding: UTF-8
		LazyData: true
		Depends: R (>= 3.3.0)
		Imports: tximport, DRIMSeq, stageR, matrixStats, GenomicFeatures, formattable, DT, Gviz, assertthat, Matrix
		Depends: R (>= 3.4.0)
		Imports: tximport, DRIMSeq, stageR, rtracklayer, formattable, DT, Gviz, assertthat, Matrix
		RoxygenNote: 7.1.0
		Roxygen: list(markdown = TRUE)

NAMESPACE

+10 −11

Original line number	Diff line number	Diff line
		exportPattern("^[[:alpha:]]+")
		# Generated by roxygen2: do not edit by hand

		import('GenomicFeatures')
		import('tximport')
		import('DRIMSeq')
		import('stageR')
		import('formattable')
		import('DT')
		import('Gviz')
		import('assertthat')
		importFrom('matrixStats', 'rowVars')
		importFrom('Matrix', 'summary')
		export(combine_to_matrix)
		export(create_dtu_table)
		export(import_counts)
		export(import_gtf)
		export(move_columns_to_front)
		export(posthoc_and_stager)
		export(rm_version)
		export(run_drimseq)
		export(sparse_filter)

R/dtu_analysis.R

+403 −68

File changed.

Preview size limit exceeded, changes collapsed.

R/utility_functions.R

+98 −110

Original line number	Diff line number	Diff line
		#' Title
		#' Replaces NA values by 1.
		#'
		#' @param x
		#' @param x Vector of values (i.e. pvalues).
		#'
		#' @return
		#' @export
		#'
		#' @examples
		#' @return Vector of values with NAs replaced.
		no_na <- function(x){
		return(ifelse(is.na(x), 1, x))
		}


		#TODO: test
		#' Filter out
		#'
		#' @param d
		#' @param filter
		#'
		#' @return
		#' @export
		#'
		#' @examples
		smallProportionSD <- function(d, filter) {
		cts <- as.matrix(subset(counts(d), select=-c(gene_id, feature_id)))
		gene.cts <- rowsum(cts, counts(d)$gene_id)
		total.cts <- gene.cts[match(counts(d)$gene_id, rownames(gene.cts)),]
		browser()
		cts <- as.matrix(subset(DRIMSeq::counts(d), select=-c("gene_id", "feature_id")))
		gene.cts <- rowsum(cts, DRIMSeq::counts(d)$gene_id)
		total.cts <- gene.cts[match(DRIMSeq::counts(d)$gene_id, rownames(gene.cts)),]
		props <- cts/total.cts
		propSD <- sqrt(matrixStats::rowVars(props))
		#propSD <- sqrt(matrixStats::rowVars(props))

		#directly call SD?
		propSD <- sqrt(apply(props,1,var))

		return(propSD < filter)
		}

		#' Title
		#'
		#' @param tx_mat
		#' Merge two or more sparse matrices.
		#'
		#' @return
		#' @export
		#' @param tx_mat List of sparse matrices that shall be merged.
		#'
		#' @examples
		#' @return Sparse matrix, containing the information of all provided matrices.
		merge_sparse <- function(tx_mat) {

		cnnew <- character()
		rnnew <- character()
		x <- vector()
		@@ -46,7 +41,7 @@ merge_sparse <- function(tx_mat) {
		j <- numeric()

		for (mat in tx_mat) {

		if(!is.null(mat)){
		cnold <- colnames(mat)
		rnold <- rownames(mat)
		cnnew <- union(cnnew,cnold)
		@@ -59,134 +54,127 @@ merge_sparse <- function(tx_mat) {
		j <- c(j,cindnew[ind[,2]])
		x <- c(x,ind[,3])
		}
		return(sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew)))
		}
		return(Matrix::sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew)))
		}


		#' Title
		#' Add tx2gene
		#'
		#' Add a data frame as feature level metadata to the Seurat active assay.
		#'
		#' @param seurat_obj
		#' @param tx2gene
		#'
		#' @return
		#' @export
		#'
		#' @examples
		#' @return Seurat object with added data.
		seurat_add_tx2gene <- function(seurat_obj, tx2gene){
		assay_name <- seurat_obj@active.assay
		order <- rownames(seurat_obj[[assay_name]]@meta.features)
		tx2gene <- tx2gene[match(order, tx2gene[[1]]),]
		assert_that(are_equal(nrow(seurat_obj[[assay_name]]@meta.features), nrow(tx2gene)), msg = "'tx2gene' data frame does not contain information for all transcripts.")
		assertthat::assert_that(nrow(seurat_obj[[assay_name]]@meta.features) == nrow(tx2gene), msg = "'tx2gene' data frame does not contain information for all transcripts.")
		seurat_obj[[assay_name]]@meta.features <- cbind(seurat_obj[[assay_name]]@meta.features,tx2gene)
		return(seurat_obj)
		}


		#' Title
		#TODO: Add more posthoc filters
		#' Posthoc filtering
		#'
		#' @param drim
		#' @param filt
		#' Perform posthoc filtering.
		#'
		#' @return
		#' @export
		#' Sets pvalue and adjusted pvalue of 'filtered' elements to 1.
		#' @param drim
		#' @param filt Threshold
		#'
		#' @examples
		#' @return A filtered `DRIMSeq::results()` data frame.
		run_posthoc <- function(drim, filt){
		res.txp.filt <- DRIMSeq::results(drim, level="feature")
		res_txp_filt <- DRIMSeq::results(drim, level="feature")
		filt <- smallProportionSD(drim, filt)
		res.txp.filt$pvalue[filt] <- 1
		res.txp.filt$adj_pvalue[filt] <- 1
		res_txp_filt$pvalue[filt] <- 1
		res_txp_filt$adj_pvalue[filt] <- 1
		message("Posthoc filtered ", sum(filt, na.rm = TRUE), " transcripts")
		return(res.txp.filt)
		}

		#TODO: reevaluate and switch to message
		#' Title
		#'
		#' @param drim
		#' @param drim_filt
		#'
		#' @return
		#' @export
		#'
		#' @examples
		count_filtered <- function(drim, drim_filt){
		drim_filt_temp <- dmFilter(drim)
		rm_trans <- nrow(drim@counts)-nrow(drim_filt@counts)
		rm_gene <- length(drim@counts@partitioning)-length(drim_filt@counts@partitioning)

		print(paste0("Removed ", rm_trans," (",formatC(((rm_trans)/nrow(drim@counts))*100, 2, format = "f"),"%) of ",nrow(drim@counts)," transcripts due to filters."))
		print(paste0("Removed ", rm_gene," (",formatC(((rm_gene)/length(drim@counts@partitioning))*100, 2, format = "f"),"%) of ",length(drim@counts@partitioning)," genes due to filters."))
		print(paste0("Of the removed transcripts: ", nrow(drim@counts)-nrow(drim_filt_temp@counts)," (",formatC(((nrow(drim@counts)-nrow(drim_filt_temp@counts))/rm_trans)*100, 2, format = "f"),"%) had no expression."))
		print(paste0("This explains ", length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning)," (",formatC(((length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning))/rm_gene)*100, 2, format = "f"),"%) of the removed genes."))
		print(paste0("--> Proceed with ",length(drim_filt@counts@partitioning)," genes and ",nrow(drim_filt@counts)," transcripts."))
		return(res_txp_filt)
		}


		#' Title
		#' Get the transcript-wise proportion differnces of the specified gene
		#'
		#' @param gID
		#' @param dtu
		#' @param gID gene identifier
		#' @param dturtle dturtle object
		#'
		#' @return
		#' @export
		#'
		#' @examples
		get_diff <- function(gID, dtu){
		group <- dtu$group
		y <- data.frame(row.names = rownames(dtu$drim@fit_full[[gID]]))
		y$a <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
		y$b <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
		#' @return Data frame with the transcript-wise proportion differences.
		get_diff <- function(gID, dturtle){
		group <- dturtle$group
		y <- data.frame(row.names = rownames(dturtle$drim@fit_full[[gID]]))
		y$a <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
		y$b <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
		y$diff <- y$a-y$b
		return(y)
		}

		#' Title
		#' Add a column specifying the maximal difference between the two comparison groups
		#'
		#' @param dtu_table
		#' @param dtu
		#'
		#' @return
		#' @export
		#' @param dtu_table The dtu data frame where the column shall be added.
		#' @param dturtle The corresponding dturtle object the `dtu_table` originates from.
		#'
		#' @examples
		add_max_delta <- function(dtu_table, dtu){
		#' @return A dtu data frame with the added column.
		add_max_delta <- function(dtu_table, dturtle){
		getmax <- function(gID){
		y <- get_diff(gID, dtu)
		y <- get_diff(gID, dturtle)
		#get absoulte maximum while preserving sign
		return(y$diff[which.max(abs(y$diff))])
		}

		dtu_table[[paste0("max(",levels(dtu$group)[1], "-",levels(dtu$group)[2],")")]] <- as.numeric(sapply(dtu_table$geneID, FUN = getmax))
		dtu_table[[paste0("max(",levels(dturtle$group)[1], "-",levels(dturtle$group)[2],")")]] <- as.numeric(sapply(dtu_table$geneID, FUN = getmax))
		return(dtu_table)
		}


		#TODO: Enhance!
		#' Title
		#' Parse a gtf file and return a transcript level dataframe.
		#'
		#' @param gtf_file
		#' @param gtf_file Path to the gtf/gff file that shall be analysed.
		#'
		#' @return
		#' @return A data frame of the availble transcript level information (e.g. the tx2gene mapping information)
		#' @export
		#'
		#' @examples
		create_tx2gene <- function(gtf_file){
		txdb <- GenomicFeatures::makeTxDbFromGFF(gtf_file)
		tx2gene <- select(txdb, keys(txdb, keytype = "TXNAME"), "GENEID", "TXNAME")
		return(tx2gene)
		#' @examples create_tx2gene("path_to/your_annotation_file.gtf")
		import_gtf <- function(gtf_file){
		gtf_grange <- rtracklayer::readGFFAsGRanges("/data/sperm_test/alevin/gencode.v33.annotation.gtf")
		df <- as.data.frame(gtf_grange[gtf_grange$type=="transcript"])
		return(df)
		}


		#' Reorder columns
		#'
		#' Reorder the columns of a data frame.
		#'
		#' Sets the desired `columns` of the dataframe `df` to the first columns. Does not change the order of the others.
		#'
		#' @param df The data frame that shall be reordered.
		#' @param columns One or multiple column names that should be moved to the front of the dataframe. Order is kept.
		#'
		#' @return Data frame with reordered columns.
		#' @export
		#'
		#' @examples move_columns_to_fron(df, c("new_first_column", "new_second_column"))
		move_columns_to_front <- function(df, columns){
		assertthat::assert_that(all(columns %in% colnames(df)), msg = "Could not find all provided column names in data frame.")
		col_order <- setdiff(colnames(df), columns)
		return(df[,c(columns, col_order)])
		}


		#' Title
		#' Remove ensembl version number
		#'
		#' @param x
		#' Remove the version number of ensembl gene/transcript identifiers.
		#'
		#' @return
		#' Removes everything beyond the first dot ('.') in each provied identifier.
		#'
		#' @param x Vector of identifiers.
		#'
		#' @return Vector of identifiers without version numbers.
		#' @export
		#'
		#' @examples
		rm_tx_version <- function(x){
		#' @examples rm_version(c("ENSG00000000001.5","ENST00000000001.2"))
		rm_version <- function(x){
		return(sub("\\..*", "", x))
		}

R/visualization.R

+1 −0

Original line number	Diff line number	Diff line
		@@ -7,6 +7,7 @@
		#'
		#' @return
		#' @family DTUrtle visualization
		#' @export
		#' @seealso
		#'
		#' @examples

Admin message