Multiple small bugfixes. Public bulk data does work now. (2638c512) · Commits · github_fork / DTUrtle

R/dtu_analysis.R

+7 −7

Original line number	Diff line number	Diff line
		@@ -3,7 +3,8 @@
		#' Import the quantification results of many RNAseq quantifiers, including 'alevin' for single-cell data.
		#' Most likely the first step in your DTUrtle analysis.
		#'
		#' Can perform multiple scaling schemes, defaults to scaling schemes appropriate for DTU analysis.
		#' Can perform multiple scaling schemes, defaults to scaling schemes appropriate for DTU analysis. For bulk data it is recommended to additionally specify a `tx2gene` data frame as parameter.
		#' This data frame must be a a two-column data frame linking transcript id (column 1) to gene id/name (column 2). This data frame is used to apply a DTU specific scaling scheme (dtuScaledTPM).
		#'
		#' @param files Vector of files to be imported. Optionally can be named to keep the samples names.
		#' @param type Type of the quantification data. All tools supported by tximport can be selected. If you have single-cell data, the use of `alevin` is proposed.
		@@ -14,10 +15,10 @@
		#' - `'stringtie'`
		#' - `'sailfish'`
		#' - `'none'`
		#' @param ... Further parameters to the specific tximport call. See `tximport()` for available parameters.
		#' @param ... Further parameters to the specific tximport call. See \code{\link[tximport:tximport]{tximport}} for available parameters.
		#'
		#' @return - For bulk data: A combined count matrix for all specified samples.
		#' - For single-cell data (`type='alevin'`): A list of count matrices per sample. Should be combined and optionally added to a Seurat object with `combine_to_matrix()`.
		#' - For single-cell data (`type='alevin'`): A list of count matrices per sample. Should be combined and optionally added to a Seurat object with [combine_to_matrix()].
		#' @family DTUrtle
		#' @export
		#'
		@@ -35,7 +36,6 @@ import_counts <- function(files, type, ...){
		if(hasArg("countsFromAbundance")){
		warning("\nImport of alevin files currently does not support using scaling methods.\nPlease note, that in tagged-end single-cell protocols (like 10X chromium) it is assumed\nthat there is no length effect in the fragment generation process - thus making a scaling unnecessary.")
		}

		for(i in files){
		return_obj <- append(return_obj, tximport::tximport(files = i, type = "alevin", ...)$counts)
		}
		@@ -43,8 +43,8 @@ import_counts <- function(files, type, ...){
		names(return_obj) <- names(files)
		}
		return(return_obj)
		}else{

		}else{
		if(hasArg("countsFromAbundance")){
		if(!args$countsFromAbundance %in% c("dtuScaledTPM", "scaledTPM")){
		warning("It is recommended to use the 'countsFromAbundance' scaling schemes 'dtuScaledTPM' or 'scaledTPM',\nto correct for increased counts of longer transcripts.\nIf you are using a tagged-end protocol, the use 'no' is suggested.")
		@@ -222,7 +222,7 @@ combine_to_matrix <- function(tx_list, cell_extensions=NULL, seurat_obj=NULL, tx
		#' - `'bulk'`: Predefined strategy for bulk RNAseq data. (default)
		#' - `'sc'`: Predefined strategy for single-cell RNAseq data.
		#' - `'own'`: Can be used to specify a user-defined strategy via the `...` argument (using the parameters of `dmFilter()`).
		#' @param BPPARAM If multicore processing should be used, specify a `BiocParallelParam` object here. Among others, can be `SerialParam()` (default) for standard non-multicore processing or `MulticoreParam('number_cores')` for multicore processing. See \code{\link[BiocParallel:BiocParallel-package]{BiocParallel}} for more information.
		#' @param BPPARAM If multicore processing should be used, specify a `BiocParallelParam` object here. Among others, can be `SerialParam()` (default) for non-multicore processing or `MulticoreParam('number_cores')` for multicore processing. See \code{\link[BiocParallel:BiocParallel-package]{BiocParallel}} for more information.
		#' @param force_dense If you do not want to use a sparse Matrix for DRIMSeq calculations, you can force a dense conversion by specifying `TRUE`. Might reduce runtime, but massively increases memory usage. Only recommended if any problems with sparse calculations appear.
		#' @param carry_over_metadata Specify if compatible additional columns of `tx2gene` shall be carried over to the gene and transcript level `meta_table` in the results. Columns with `NA` values are not carried over.
		#' @param ... If `filtering_strategy='own'` specify the wished parameters of `dmFilter()` here.
		@@ -258,7 +258,7 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
		assertthat::assert_that(is(tx2gene, "data.frame"))
		assertthat::assert_that(is(pd, "data.frame"))
		assertthat::assert_that(ncol(tx2gene)>1)
		assertthat::assert_that(cond_col %in% colnames(pd), msg = paste0("Could not find", cond_col, " in colnames of pd."))
		assertthat::assert_that(cond_col %in% colnames(pd), msg = paste0("Could not find", cond_col, " in column names of pd."))
		assertthat::assert_that(filtering_strategy %in% c("bulk", "sc", "own"), msg = "Please select a valid filtering strategy ('bulk', 'sc' or 'own').")
		assertthat::assert_that(is(BPPARAM, "BiocParallelParam"), msg = "Please provide a valid BiocParallelParam object.")
		assertthat::assert_that(is.logical(force_dense))

R/utility_functions.R

+7 −6

Original line number	Diff line number	Diff line
		@@ -106,9 +106,9 @@ run_posthoc <- function(drim, filt){
		get_diff <- function(gID, dturtle){
		group <- dturtle$group
		y <- data.frame(row.names = rownames(dturtle$drim@fit_full[[gID]]))
		y$a <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
		y$b <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
		y$diff <- y$a-y$b
		y[levels(group)[1]] <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
		y[levels(group)[2]] <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
		y$diff <- y[[1]]-y[[2]]
		return(y)
		}

		@@ -133,9 +133,10 @@ getmax <- function(gID, dturtle){
		#' @return A data frame of the availble transcript level information (e.g. the tx2gene mapping information)
		#' @export
		#'
		#' @examples create_tx2gene("path_to/your_annotation_file.gtf")
		#' @examples ## import_gtf("path_to/your_annotation_file.gtf")
		import_gtf <- function(gtf_file){
		gtf_grange <- rtracklayer::readGFFAsGRanges("/data/sperm_test/alevin/gencode.v33.annotation.gtf")
		assertthat::assert_that(file.exists(gtf_file))
		gtf_grange <- rtracklayer::readGFFAsGRanges(gtf_file)
		df <- as.data.frame(gtf_grange[gtf_grange$type=="transcript"])
		return(df)
		}
		@@ -153,7 +154,7 @@ import_gtf <- function(gtf_file){
		#' @return Data frame with reordered columns.
		#' @export
		#'
		#' @examples move_columns_to_fron(df, c("new_first_column", "new_second_column"))
		#' @examples ## move_columns_to_fron(df, c("new_first_column", "new_second_column"))
		move_columns_to_front <- function(df, columns){
		assertthat::assert_that(all(columns %in% colnames(df)), msg = "Could not find all provided column names in data frame.")
		col_order <- setdiff(colnames(df), columns)

R/visualization.R

+39 −26

Original line number	Diff line number	Diff line
		@@ -2,7 +2,7 @@
		#'
		#' Summarize the key results of the DTUrtle analysis to a gene-level data frame.
		#'
		#' This function provides an easy interface to summarize the key DTUrtle results toegther with user-defined meta data columns to a gene-level data frame.
		#' This function provides an easy interface to summarize the key DTUrtle results together with user-defined meta data columns to a gene-level data frame.
		#'
		#' @param dturtle Result object of `posthoc_and_stager()`. Must be of class `dturtle`.
		#' @param add_gene_metadata A list of columns of the object's `meta_table_gene`, the gene-level meta data table.
		@@ -53,7 +53,7 @@ create_dtu_table <- function(dturtle, add_gene_metadata = list("pct_gene_expr"="
		names(valid_cols)[names(valid_cols) == ""] <- unlist(valid_cols[names(valid_cols) == ""])
		}
		colnames(add_table) <- make.names(names(valid_cols))
		dtu_table <- cbind(dtu_table, add_table)
		dtu_table <- cbind(dtu_table, add_table, stringsAsFactors=F)
		}

		if(!is.null(add_tx_metadata)){
		@@ -68,12 +68,11 @@ create_dtu_table <- function(dturtle, add_gene_metadata = list("pct_gene_expr"="
		temp_table <- dturtle$meta_table_tx[dturtle$meta_table_tx$gene %in% dtu_table$geneID, c("gene",unlist(valid_cols)), drop=F]
		add_table <- lapply(dtu_table$geneID, function(gene){
		temp <- temp_table[temp_table$gene==gene,]
		sapply(seq_along(funcs), function(i) funcs[[i]](temp[[i+1]]))
		lapply(seq_along(funcs), function(i) funcs[[i]](temp[[i+1]]))
		})
		if(any(unlist(lapply(add_table, lengths))>1)){
		stop("One or multiple transcript-level summararizations did return more than one value. These were:\n\t", paste0(valid_cols[unique(unlist(lapply(lapply(add_table, lengths) ,function(x) which(x>1))))], collapse = "\n\t"))
		}

		add_table <- do.call(rbind.data.frame, add_table)
		assertthat::assert_that(nrow(add_table)==nrow(dtu_table))
		if(is.null(names(valid_cols))){
		@@ -82,7 +81,7 @@ create_dtu_table <- function(dturtle, add_gene_metadata = list("pct_gene_expr"="
		names(valid_cols)[names(valid_cols) == ""] <- unlist(valid_cols[names(valid_cols) == ""])
		}
		colnames(add_table) <- make.names(names(valid_cols))
		dtu_table <- cbind(dtu_table, add_table)
		dtu_table <- cbind(dtu_table, add_table, stringsAsFactors=F)
		}

		dtu_table <- rapply(dtu_table, as.character, classes="factor", how="replace")
		@@ -340,27 +339,37 @@ plot_dtu_table <- function(dtu, txdf, title, folder, cores=1, image_folder="imag



		#' Title
		#' Visualize as barplot
		#'
		#' Visualize genes and it's transcript proportions in a barplot.
		#'
		#' Shows the transcripts proportional change per analysis group, together with the mean fit value via a horizontal line. Significant transcript's names are marked in red.
		#'
		#' @param dturtle Result object of `posthoc_and_stager()`. Must be of class `dturtle`.
		#' @param genes Character vector of genes to plot. If `NULL`, defaults to all found significant genes (`sig_genes`).
		#' @param meta_gene_id
		#' @param group_colors
		#' @param dash_color
		#' @param savepath
		#' @param meta_gene_id Optionally specify the column name in `meta_table_gene`, which contains gene identifiers.
		#' @param group_colors Optionally specify the colours for the two sample groups in the plot. Must be a named vector, with the group names as names.
		#' @param fit_line_color Optionally specify the colour to use for the mean fit line.
		#' @param savepath If you want your files to be saved to disk, specify a savepath here. The directories will be created, if necessary.
		#' @param filename_ext Optionally specify a file name extension here, which also defines the save image format. The file name will be 'gene_name+extension'.
		#' @param BPPARAM If multicore processing should be used, specify a `BiocParallelParam` object here. Among others, can be `SerialParam()` (default) for non-multicore processing or `MulticoreParam('number_cores')` for multicore processing. See \code{\link[BiocParallel:BiocParallel-package]{BiocParallel}} for more information.
		#' @param ... Further parameters for image saving with ggsave. Can be used to specify plot dimensions, etc. See \code{\link[ggplot2:ggsave]{ggsave}} for more information.
		#'
		#' @return
		#' @return A list of the created plots. Can be further processed if wanted.
		#' @family DTUrtle visualization
		#' @export
		#' @seealso [run_drimseq()] and [posthoc_and_stager()] for DTU object creation. [create_dtu_table()] and [plot_dtu_table()] for table visualization.
		#'
		#' @examples
		plot_prop_bar <- function(dturtle, genes=NULL, meta_gene_id=NULL, group_colors=NULL, dash_color="red", savepath=NULL){
		plot_prop_bar <- function(dturtle, genes=NULL, meta_gene_id=NULL, group_colors=NULL, fit_line_color="red", savepath=NULL, filename_ext="_barplot.png", BPPARAM=BiocParallel::SerialParam(), ...){
		assertthat::assert_that(is(dturtle,"dturtle"), msg = "The provided dturtle object is not of class 'dturtle'.")
		assertthat::assert_that(is.null(genes)\|\|(is(genes,"character")&&length(genes)>0), msg = "The genes object must be a non-empty character vector or NULL.")
		assertthat::assert_that(is.null(meta_gene_id)\|\|(is(meta_gene_id, "character")&&meta_gene_id %in% colnames(dturtle$meta_table_gene)), msg = "The provided meta_gene_id column could not be found or is of wrong format.")
		assertthat::assert_that(is.null(group_colors)\|\|(is(group_colors, "list")&&!is.null(names(group_colors))), msg = "The provided group colors must be a named list or NULL.")
		assertthat::assert_that(is.null(dash_color)\|\|is(dash_color,"character"), msg = "The provided dash_color must be of type character or NULL.")
		assertthat::assert_that(is.null(fit_line_color)\|\|is(fit_line_color,"character"), msg = "The provided fit_line_color must be of type character or NULL.")
		assertthat::assert_that(is.null(savepath)\|\|is(savepath,"character"), msg = "The provided savepath must be of type character or NULL.")
		assertthat::assert_that(is(filename_ext, "character"), msg = "The provided savepath must be of type character." )
		assertthat::assert_that(is(BPPARAM, "BiocParallelParam"), msg = "Please provide a valid BiocParallelParam object.")
		assertthat::assert_that(!is.null(dturtle$sig_gene), msg = "The provided dturtle object does not contain all the needed information. Have you run 'posthoc_and_stager()'?")
		assertthat::assert_that(length(dturtle$sig_gene)>0, msg = "The provided dturtle object does not contain any significant gene. Maybe try to rerun the pipeline with more relaxes thresholds.")
		assertthat::assert_that(!is.null(dturtle$meta_table_gene), msg = "The provided dturtle object does not contain all the needed information. Have you run 'posthoc_and_stager()'?")
		@@ -383,7 +392,11 @@ plot_prop_bar <- function(dturtle, genes=NULL, meta_gene_id=NULL, group_colors=N
		names(gene_ids) <- valid_genes
		}

		plot_list <- lapply(valid_genes, function(gene){
		if(!is.null(savepath) && !dir.exists(savepath)){
		dir.create(file.path(savepath), recursive = T)
		}

		plot_list <- BiocParallel::bplapply(valid_genes, function(gene){
		counts_gene <- as.matrix(dturtle$drim@counts[[gene]])
		group <- dturtle$group

		@@ -408,7 +421,8 @@ plot_prop_bar <- function(dturtle, genes=NULL, meta_gene_id=NULL, group_colors=N
		md <- dturtle$drim@samples

		proportions <- sweep(counts_gene, 2, colSums(counts_gene), "/")
		proportions[is.nan(proportions)] <- NA
		#Nan if counts are all 0 --> 0/0
		proportions[is.nan(proportions)] <- 0
		prop_samp <- data.frame(feature_id = rownames(proportions), proportions, stringsAsFactors = F, check.names = F)
		prop_fit <- data.frame(feature_id = rownames(fit_full), fit_full, stringsAsFactors = F, check.names = F)

		@@ -453,27 +467,26 @@ plot_prop_bar <- function(dturtle, genes=NULL, meta_gene_id=NULL, group_colors=N
		ggp <- ggplot2::ggplot() +
		ggplot2::geom_bar(data = prop_samp, ggplot2::aes_string(x = "feature_id", y = "proportion", group = "sample_id", fill = "group"),
		stat = "identity", position = ggplot2::position_dodge(width = 0.9)) + ggplot2::theme_bw() +
		ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 25, vjust = 1, hjust=1, colour = text_colour), axis.text = ggplot2::element_text(size = 12),
		suppressWarnings(ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 25, vjust = 1, hjust=1, colour = text_colour), axis.text = ggplot2::element_text(size = 12),
		axis.title = ggplot2::element_text(size = 12, face = "bold"), plot.title = ggplot2::element_text(size = 12),
		legend.position = "right", legend.title = ggplot2::element_text(size = 12), legend.text = ggplot2::element_text(size = 12)) +
		legend.position = "right", legend.title = ggplot2::element_text(size = 12), legend.text = ggplot2::element_text(size = 12))) +
		ggplot2::scale_fill_manual(name = "Groups", values = group_colors, breaks = names(group_colors)) +
		ggplot2::labs(title = main, x = "Features", y = "Proportions")

		if(!is.null(dash_color)){
		if(!is.null(fit_line_color)){
		ggp <- ggp + ggplot2::geom_errorbar(data = prop_fit, ggplot2::aes_string(x = "feature_id", ymin = "proportion", ymax = "proportion", group = "sample_id"),
		position = ggplot2::position_dodge(width = 0.9), size=0.5, linetype = "dashed", inherit.aes = F, width = 1, colour = dash_color)
		position = ggplot2::position_dodge(width = 0.9), size=0.5, linetype = "solid", inherit.aes = F, width = 1, colour = fit_line_color)
		}
		if(!is.null(savepath)){
		if(!dir.exists(savepath)){
		dir.create(savepath)
		}
		ggplot2::ggsave(filename = paste0(savepath,make.names(gene),"_bar.png"), plot = ggp, ...)
		ggplot2::ggsave(filename = file.path(savepath,paste0(make.names(gene),filename_ext)), plot = ggp, ...)
		}
		return(setNames(list(ggp),gene))
		})
		return(plot_list)
		return(ggp)
		}, BPPARAM = BPPARAM)
		return(setNames(plot_list, valid_genes))
		}



		plot_heat_per_gene <- function(gene_id, txdf, all_counts, mut, mut_genes, path){
		#gene <- gene_id
		gene_name <- txdf$external_gene_name[match(gene_id, txdf$GENEID)]

man/create_dtu_table.Rd

+1 −1

Original line number	Diff line number	Diff line
		@@ -28,7 +28,7 @@ An extended \code{dturtle} object, including the added \code{dtu_table}.
		Summarize the key results of the DTUrtle analysis to a gene-level data frame.
		}
		\details{
		This function provides an easy interface to summarize the key DTUrtle results toegther with user-defined meta data columns to a gene-level data frame.
		This function provides an easy interface to summarize the key DTUrtle results together with user-defined meta data columns to a gene-level data frame.
		}
		\seealso{
		\code{\link[=run_drimseq]{run_drimseq()}} and \code{\link[=posthoc_and_stager]{posthoc_and_stager()}} for DTU object creation. \code{\link[=plot_dtu_table]{plot_dtu_table()}} for table visualization.

man/import_counts.Rd

+4 −3

Original line number	Diff line number	Diff line
		@@ -20,12 +20,12 @@ import_counts(files, type, ...)
		\item \code{'none'}
		}}

		\item{...}{Further parameters to the specific tximport call. See \code{tximport()} for available parameters.}
		\item{...}{Further parameters to the specific tximport call. See \code{\link[tximport:tximport]{tximport}} for available parameters.}
		}
		\value{
		\itemize{
		\item For bulk data: A combined count matrix for all specified samples.
		\item For single-cell data (\code{type='alevin'}): A list of count matrices per sample. Should be combined and optionally added to a Seurat object with \code{combine_to_matrix()}.
		\item For single-cell data (\code{type='alevin'}): A list of count matrices per sample. Should be combined and optionally added to a Seurat object with \code{\link[=combine_to_matrix]{combine_to_matrix()}}.
		}
		}
		\description{
		@@ -33,7 +33,8 @@ Import the quantification results of many RNAseq quantifiers, including 'alevin'
		Most likely the first step in your DTUrtle analysis.
		}
		\details{
		Can perform multiple scaling schemes, defaults to scaling schemes appropriate for DTU analysis.
		Can perform multiple scaling schemes, defaults to scaling schemes appropriate for DTU analysis. For bulk data it is recommended to additionally specify a \code{tx2gene} data frame as parameter.
		This data frame must be a a two-column data frame linking transcript id (column 1) to gene id/name (column 2). This data frame is used to apply a DTU specific scaling scheme (dtuScaledTPM).
		}
		\seealso{
		Other DTUrtle:

Admin message