Commit 1df29a16 authored by tekath's avatar tekath
Browse files

Get rid of sweep(), fastening sparse computations. Add proportion_table function.

parent fc276cfe
Loading
Loading
Loading
Loading
+4 −5
Original line number Diff line number Diff line
Package: DTUrtle
Type: Package
Title: What the Package Does (Title Case)
Title: Differential Transcript Usage analysis
Version: 0.1.0
Author: Tobias Tekath
Maintainer: The package maintainer <yourself@somewhere.net>
Description: More about what it does (maybe more than one line)
    Use four spaces when indenting paragraphs within the Description.
Maintainer: Tobias Tekath <tobias.tekath@wwu.de>
Description: Easily perform Differential Transcript Usage (DTU) analysis for bulk or single-cell RNAseq data. Needs transcript level counts.
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Depends: R (>= 3.4.0)
Imports: Matrix, tximport, sparseDRIMSeq, stageR, rtracklayer, formattable, DT, Gviz, assertthat, scales, ggplot2, reshape2, BiocParallel
Imports: Matrix, tximport, sparseDRIMSeq, stageR, rtracklayer, formattable, DT, Gviz, assertthat, scales, ggplot2, reshape2, BiocParallel, Matrix.utils, pheatmap
RoxygenNote: 7.1.0
Roxygen: list(markdown = TRUE)
+3 −0
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ export(check_unique_by_partition)
export(combine_to_matrix)
export(create_dtu_table)
export(get_by_partition)
export(get_proportion_matrix)
export(granges_reduce_introns)
export(import_counts)
export(import_gtf)
@@ -11,6 +12,8 @@ export(move_columns_to_front)
export(one_to_one_mapping)
export(plot_dtu_table)
export(plot_proportion_barplot)
export(plot_transcripts_view)
export(posthoc_and_stager)
export(rm_version)
export(run_drimseq)
export(summarize_to_gene)
+19 −40
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
import_counts <- function(files, type, ...){
    assertthat::assert_that(type %in% c("salmon", "alevin", "kallisto", "bustools", "rsem", "stringtie", "sailfish", "none"))
    assertthat::assert_that(length(type)==1)
    assertthat::assert_that(length(files)>=1)
    message("Reading in ", length(files), " ", type, " runs.")

    args=list(...)
@@ -42,6 +43,7 @@ import_counts <- function(files, type, ...){
        }

        if(type=="alevin"){
            assertthat::assert_that(all(basename(files) == "quants_mat.gz"), msg = "Expecting 'files' to point to 'quants_mat.gz' file in a directory 'alevin'\n  also containing 'quants_mat_rows.txt' and 'quant_mat_cols.txt'.\n  Please re-run alevin preserving output structure.")
            return_obj <- lapply(files, function(i) tximport::tximport(files = i, type = "alevin", ...)$counts)
        }else{
            return_obj <- lapply(files, function(i) readin_bustools(files = i))
@@ -211,6 +213,7 @@ combine_to_matrix <- function(tx_list, cell_extensions=NULL, seurat_obj=NULL, tx
    }
}

#TODO: Carry over sample_pd if Seurat
#TODO: Specify predefined strategies!
#' Main DRIMSeq results
#'
@@ -222,22 +225,23 @@ combine_to_matrix <- function(tx_list, cell_extensions=NULL, seurat_obj=NULL, tx
#' 1. (sparse) matrix with feature counts, where the rows correspond to features (e.g. transcripts). One column per sample/cell with the count data for the specified features must be present (The names of these columns must match with the identifiers in `id_col`).
#' 2. Seurat object with a transcription level assay as `active.assay` (most likely result object from [combine_to_matrix()])
#' @param tx2gene Data frame, where the first column consists of feature identifiers and the second column consists of corresponding gene identifiers. Feature identifiers must match with the rownames of the counts object. If a Seurat object is provided in `counts` and `tx2gene` was provided in [combine_to_matrix()], a vector of the colnames of the specific feature and gene identifierss is sufficient.
#' @param pd Data frame with at least a column of sample/cell identifiers (specified in `id_col`) and the comparison group definition (specified in `cond_col`).
#' @param pd Data frame with at least a column of sample/cell identifiers (rownames or specified in `id_col`) and the comparison group definition (specified in `cond_col`).
#' @param id_col Name of the column in `pd`, where unique sample/cell identifiers are stored. If `NULL` (default), use rownames of `pd`.
#' @param cond_col Name of the column in `pd`, where the comparison groups are defined. If more than 2 levels/groups are present, the groups that should be used must be specified in `cond_levels`.
#' @param cond_col Name of the column in `pd`, where the comparison groups/conditions are defined. If more than 2 levels/groups are present, the groups that should be used must be specified in `cond_levels`.
#' @param cond_levels Define two levels/groups of `cond_col`, that should be compared. The order of the levels states the comparison formula (i.e. `cond_col[1]-cond_col[2]`).
#' @param filtering_strategy Define the filtering strategy to reduce and noise and increase statistical power.
#' - `'bulk'`: Predefined strategy for bulk RNAseq data. (default)
#' - `'sc'`: Predefined strategy for single-cell RNAseq data.
#' - `'own'`: Can be used to specify a user-defined strategy via the `...` argument (using the parameters of `dmFilter()`).
#' @param BPPARAM If multicore processing should be used, specify a `BiocParallelParam` object here. Among others, can be `SerialParam()` (default) for non-multicore processing or `MulticoreParam('number_cores')` for multicore processing. See \code{\link[BiocParallel:BiocParallel-package]{BiocParallel}} for more information.
#' @param force_dense If you do not want to use a sparse Matrix for DRIMSeq calculations, you can force a dense conversion by specifying `TRUE`. Might reduce runtime, but massively increases memory usage. Only recommended if any problems with sparse calculations appear.
#' @param force_dense If you do not want to use a sparse Matrix for DRIMSeq calculations, you can force a dense conversion by specifying `TRUE`. Increases memory usage, but also reduces runtime drastically (currently).
#' @param carry_over_metadata Specify if compatible additional columns of `tx2gene` shall be carried over to the gene and transcript level `meta_table` in the results. Columns with `NA` values are not carried over.
#' @param ... If `filtering_strategy='own'` specify the wished parameters of `dmFilter()` here.
#'
#' @return `dturtle` object with the key results, that can be used in the DTUrtle steps hereafter. The object is just a easily accesible list with the following items:
#' - `meta_table_gene`: Data frame of the expressed-in ratio of all genes. Expressed-in is defined as expression > 0. Can be used to add gene level metainformation for plotting.
#' - `meta_table_tx`: Data frame of the expressed-in ratio of all transcripts. Expressed-in is defined as expression > 0. Can be used to add transcript level metainformation for plotting.
#' - `meta_table_gene`: Data frame of the expressed-in ratio of all genes. Expressed-in is defined as expression > 0. Can be used to add gene level meta-information for plotting.
#' - `meta_table_tx`: Data frame of the expressed-in ratio of all transcripts. Expressed-in is defined as expression > 0. Can be used to add transcript level meta-information for plotting.
#' - `meta_table_sample`: Data frame of the provided sample level information (`pd`). Can be used to add sample level meta-information for plotting.
#' - `drim`: The results of the DRIMSeq statistical computations (`dmTest()`).
#' - `design`: The design matrix generated from the specified `pd` columns.
#' - `group`: Vector which sample/cell belongs to which comparison group.
@@ -247,9 +251,7 @@ combine_to_matrix <- function(tx_list, cell_extensions=NULL, seurat_obj=NULL, tx
#' @export
#'
#' @examples
run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=NULL, filtering_strategy="bulk", BPPARAM=BiocParallel::SerialParam(), force_dense=F, carry_over_metadata=T, ...){
    tictoc::tic("Total time")
    tictoc::tic("Preprocess")
run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=NULL, filtering_strategy="bulk", BPPARAM=BiocParallel::SerialParam(), force_dense=T, carry_over_metadata=T, ...){
    if(is(counts, "Seurat")){
        assertthat::assert_that(require("Seurat", character.only = T), msg = "The package Seurat is needed for adding the combined matrix to a seurat object.")
        assertthat::assert_that(packageVersion("Seurat")>="3.0.0", msg = "At least Version 3 of Seurat is needed. Currently only Seurat 3 objects are supported.")
@@ -281,14 +283,9 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
    colnames(tx2gene) <- make.names(colnames(tx2gene), unique = T)

    if(is.null(cond_levels)){
        if(length(unique(pd[[cond_col]]))==2){
            cond_levels <- unique(pd[[cond_col]])
    }
        else{
            stop("More than two levels found in 'cond_col'. Please specify the two levels you want to compare in 'cond_levels'.")
        }
    }
    assertthat::assert_that(length(cond_levels)==2)
    assertthat::assert_that(length(cond_levels)==2, msg = "More than two levels found in 'cond_col'. Please specify the two levels you want to compare in 'cond_levels'.")
    message("\nComparing ", cond_levels[1], " vs ", cond_levels[2])

    if(is.null(id_col)){
@@ -334,9 +331,7 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
    own={
        filter_opt_list <- modifyList(filter_opt_list, list(...))
    })
    tictoc::tic("Filter")
    counts <- do.call(sparse_filter, args = c(list("counts" = counts, "tx2gene" = tx2gene, "BPPARAM" = BPPARAM), filter_opt_list), quote=TRUE)
    tictoc::toc(log = T)
    tx2gene <- tx2gene[match(rownames(counts), tx2gene$feature_id),]

    if(is(counts, 'sparseMatrix')&force_dense){
@@ -353,11 +348,8 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
    }

    drim <- sparseDRIMSeq::sparse_dmDSdata(tx2gene = tx2gene, counts = counts, samples = samp)
    tictoc::tic("Ratios")
    exp_in_tx <- ratio_expression_in(drim, "tx", BPPARAM=BPPARAM)
    exp_in_gn <- ratio_expression_in(drim, "gene", BPPARAM=BPPARAM)
    tictoc::toc(log = T)
    tictoc::tic("Metadata")
    #carry over metadata
    if(carry_over_metadata&ncol(tx2gene)>2){
        tx2gene <- tx2gene[match(rownames(exp_in_tx), tx2gene$feature_id),]
@@ -371,28 +363,19 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
            }
        }
    }
    tictoc::toc(log = T)
    design_full <- model.matrix(~condition, data=samp)

    tictoc::toc(log = T)
    message("\nPerforming statistical tests...\n")
    tictoc::tic("Precision")
    drim_test <- sparseDRIMSeq::dmPrecision(drim, design=design_full, prec_subset=1, BPPARAM=BPPARAM, add_uniform=T)
    tictoc::toc(log = T)
    tictoc::tic("Fit")
    drim_test <- sparseDRIMSeq::dmFit(drim_test, design=design_full, BPPARAM=BPPARAM, add_uniform=T)
    tictoc::toc(log = T)
    tictoc::tic("Test")
    drim_test <- sparseDRIMSeq::dmTest(drim_test, coef=colnames(design_full)[2], BPPARAM=BPPARAM)
    tictoc::toc(log = T)

    group <- factor(samp$condition, levels = cond_levels, ordered = T)
    tictoc::toc(log = T)

    exp_in_gn <- rapply(exp_in_gn, as.character, classes="factor", how="replace")
    exp_in_tx <- rapply(exp_in_tx, as.character, classes="factor", how="replace")

    return_obj <- list("meta_table_gene"=exp_in_gn, "meta_table_tx"=exp_in_tx,
    return_obj <- list("meta_table_gene"=exp_in_gn, "meta_table_tx"=exp_in_tx, "meta_table_sample"=samp,
                       "drim"=drim_test, "design_full"=design_full, "group"=group,
                       "used_filtering_options"=list("DRIM"=filter_opt_list), "tictoc"=tictoc::tic.log(format = T))
    class(return_obj) <- append("dturtle", class(return_obj))
@@ -401,7 +384,6 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=


#TODO: implement Noise - IQR filtering
#TODO: Posthoc!

#' Posthoc filtering and two-staged statistical tests
#'
@@ -414,7 +396,7 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
#'
#' @param dturtle Result object of [run_drimseq()]. Must be of class `dturtle`.
#' @param ofdr Overall false discovery rate (OFDR) threshold.
#' @param posthoc_filt Specify the minimal standard deviation of a transcripts porportion level that should be kept when performing posthoc filtering. To disbale poshoc filtering 0 or `FALSE` can be provided.
#' @param posthoc Specify the minimal standard deviation of a transcripts porportion level that should be kept when performing posthoc filtering. To disbale poshoc filtering 0 or `FALSE` can be provided.
#' @return An extended `dturtle` object. Additional slots include:
#' - `sig_gene`: A character vector of all genes where the first stageR step was significant. Basically the significant genes, that showed signs of DTU.
#' - `sig_tx` : A named character vector of transcripts where the second stageR step was significant. Basically the significant transcripts of the significant genes.
@@ -426,19 +408,16 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
#'
#' @examples
posthoc_and_stager <- function(dturtle, ofdr=0.05, posthoc=0.1){
    assertthat::assert_that(is(dturtle,"dturtle"), msg = "The provided dturtle object is not of class 'dturtle'.")
    if(posthoc!=F){
        assertthat::assert_that(0<=posthoc & posthoc<=1, msg = "The provided 'posthoc' parameter is invalid. Must be a number between [0,1] or FALSE.")
    }else{
        assertthat::assert_that(posthoc==F, msg = "The provided 'posthoc' parameter is invalid. Must be a number between [0,1] or FALSE.")
    }
    assertthat::assert_that(!is.null(dturtle$drim), msg = "The provided dturtle object does not contain all the needed information. Have you run 'run_drimseq()'?")
    assertthat::assert_that((is.numeric(posthoc)&0<=posthoc & posthoc<=1)||isFALSE(posthoc), msg = "The provided 'posthoc' parameter is invalid. Must be a number between [0,1] or FALSE.")
    assertthat::assert_that(0<=ofdr & ofdr<=1, msg = "The provided 'ofdr' parameter is invalid. Must be a number between [0,1].")

    res <- sparseDRIMSeq::results(dturtle$drim)
    res_txp <- sparseDRIMSeq::results(dturtle$drim, level="feature")

    if(posthoc!=F|posthoc>0){
    res <- sparseDRIMSeq::results(dturtle$drim)
    if(posthoc!=F||posthoc>0){
        res_txp <- run_posthoc(dturtle$drim, posthoc)
    }else{
        res_txp <- sparseDRIMSeq::results(dturtle$drim, level="feature")
    }

    res$pvalue <- no_na(res$pvalue)
+3 −3
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@
#'
#' Perform dmFilter-like filtering for sparse or dense matrices
#'
#' Runtime optimised version of `DRIMSeq::dmFilter`, which can optionally be executed in parallel.
#' Runtime optimised version of \code{\link[sparseDRIMSeq:dmFilter]{dmFilter()}}, which can optionally be executed in parallel.
#'
#' @param counts Sparse count matrix.
#' @param tx2gene Feature to gene mapping.
@@ -45,7 +45,7 @@ sparse_filter <- function(counts, tx2gene, BPPARAM=BiocParallel::SerialParam(),
            return(NULL)

        temp <- expr_features[, samps2keep, drop = FALSE]
        prop <- sweep(temp, 2, Matrix::colSums(temp), "/")
        prop <- temp %*% Matrix::diag(1/Matrix::colSums(temp))

        ### features with min proportion
        row_index <- Matrix::rowSums(prop >= min_feature_prop) >= min_samps_feature_prop
@@ -104,5 +104,5 @@ readin_bustools <- function(files){
    colnames <- scan(colnames, what = "character", quiet=T)
    assertthat::assert_that(length(colnames)==ncol(mtx), msg = "Number of features does not match to matrix.")
    dimnames(mtx) <- list(rownames, colnames)
    return(as(mtx, "dgCMatrix"))
    return(as(Matrix::t(mtx), "dgCMatrix"))
}
+108 −23
Original line number Diff line number Diff line
@@ -7,27 +7,23 @@ no_na <- function(x){
    return(ifelse(is.na(x), 1, x))
}

#TODO: test
#Not working for sparse! Need grouped row sums like rowsums
#' Filter out
#' Filter out results, whose standard deviation of proportional ratios is below the filter value.
#'
#' @param d
#' @param filter
#' @param drim Results object of DRIMSeq statistical computations
#' @param filter A filter threshold, stating the minimal standard deviation to keep.
#'
#' @return
smallProportionSD <- function(d, filter) {
    browser()
    d <- sparseDRIMSeq::counts(d)
    cts <- as.matrix(d[,!colnames(d) %in% c("gene_id", "feature_id")])
    gene.cts <- rowsum(cts, d$gene_id)
    total.cts <- gene.cts[match(d$gene_id, rownames(gene.cts)),]
    props <- cts/total.cts
    #propSD <- sqrt(matrixStats::rowVars(props))

    #directly call SD?
    propSD <- sqrt(apply(props,1,var))

    return(propSD < filter)
#' @return A boolean vector, stating the elements to dismiss (==True).
smallProportionSD <- function(drim, filter) {
    cts <- drim@counts@unlistData
    part <- drim@counts@partitioning
    dismiss <- unlist(lapply(X=part, FUN=function(x){
        temp <- cts[x,]
        prop <- temp %*% Matrix::diag(1/Matrix::colSums(temp))
        prop_sd <- prop-Matrix::rowMeans(prop)
        prop_sd <- sqrt(Matrix::rowSums(prop_sd*prop_sd)/(ncol(prop_sd)-1))
        return(prop_sd < filter)
        }))
    return(dismiss)
}

#' Merge two or more sparse matrices.
@@ -93,7 +89,7 @@ run_posthoc <- function(drim, filt){
    filt <- smallProportionSD(drim, filt)
    res_txp_filt$pvalue[filt] <- 1
    res_txp_filt$adj_pvalue[filt] <- 1
    message("Posthoc filtered ", sum(filt, na.rm = TRUE), " transcripts")
    message("Posthoc filtered ", sum(filt, na.rm = T), " features.")
    return(res_txp_filt)
}

@@ -285,6 +281,95 @@ get_by_partition <- function(df, partitioning, FUN, columns=NULL, simplify=T, dr
}



#' Summarize matrix to gene level
#'
#' Summarize a transcript level matrix to gene level.
#'
#' Can be used with sparse or dense expression matrices
#'
#' @param mtx A (sparse) expression matrix on transcript / feature level
#' @param tx2gene A data frame, mapping the `mtx` rownames (first column) to genes (second column).
#' @inheritParams Matrix.utils::aggregate.Matrix
#'
#' @return A summarised (sparse) matrix
#' @export
summarize_to_gene <- function(mtx, tx2gene, fun="sum"){
    assertthat::assert_that(is(mtx, "matrix")||is(mtx, "sparseMatrix"))
    assertthat::assert_that(is.data.frame(tx2gene))
    assertthat::assert_that(all(rownames(mtx) %in% tx2gene[[1]]))

    tx2gene <- tx2gene[match(rownames(mtx), tx2gene[[1]]),]

    if(is(mtx, "sparseMatrix")){
        return(Matrix.utils::aggregate.Matrix(x = mtx, groupings = tx2gene[[2]], fun = fun))
    }else{
        return(as.matrix(Matrix.utils::aggregate.Matrix(x = mtx, groupings = tx2gene[[2]], fun = fun)))
    }

}





#' Actual computation of proportion matrices
#'
#' @param mtx A (sparse) transcript-level matrix
#' @param partitioning A DRIMSeq partitioning list, specifying which transcripts belong to which genes.
#'
#' @return A (sparse) matrix of transcript proportions
prop_matrix <- function(mtx, partitioning=NULL){
    assertthat::assert_that(is(mtx, "matrix")||is(mtx, "sparseMatrix"))
    assertthat::assert_that(is(partitioning, "list")||is.null(partitioning))

    if(!is.null(partitioning)){
        if(sum(lengths(partitioning))!=nrow(mtx)){
            part_df <- data.frame("gene"=rep(names(partitioning),lengths(partitioning)), "tx"=unlist(sapply(partitioning, FUN = names)), stringsAsFactors = F)
            part_df <- part_df[match(rownames(mtx), part_df$tx),]
            partitioning <- split(part_df$tx,part_df$gene)
        }
        mat_list <- sapply(X=partitioning, FUN=function(x){
            temp <- mtx[x,]
            return(temp %*% Matrix::diag(1/Matrix::colSums(temp)))
        })
        return(do.call(rbind,mat_list))

    }else{
        return(mtx %*% Matrix::diag(1/Matrix::colSums(mtx)))
    }
}


#' Compute proportion matrix
#'
#' Compute a matrix of transcript proportions per gene. Either a (sparse) count matrix or a `dturtle` object can be provided.
#'
#' If a `dturtle` object is provided, a list of genes the data shall be subsetted to can optionally be given.
#'
#' @param obj (sparse) matrix or `dturtle` object.
#' @param genes Specify the genes to subset the data to, if a `dturtle` object is provided.
#'
#' @return A (sparse) matrix of transcript proportions
#' @export
#'
#' @examples
get_proportion_matrix <- function(obj,genes=NULL){
    assertthat::assert_that(is(obj, "matrix")||is(obj, "sparseMatrix")||is(obj, "dturtle"), msg = "obj must be a (sparse) matrix or of class 'dturtle'.")
    assertthat::assert_that(is.null(genes)||(is(genes,"character")&&length(genes)>0), msg = "The genes object must be a non-empty character vector or NULL.")
    if(is(obj, "matrix")||is(obj, "sparseMatrix")){
        return(prop_matrix(obj))
    }else{
        assertthat::assert_that(!is.null(obj$drim), msg="obj does not contain DRIMSeq results.")
        if(!is.null(genes)){
            return(prop_matrix(do.call(rbind,obj$drim@counts@unlistData[genes,]), partitioning = obj$drim@counts@partitioning))
        }else{
            return(prop_matrix(obj$drim@counts@unlistData, partitioning = obj$drim@counts@partitioning))
        }

    }
}

#' Ensure one-to-one mapping
#'
#' Ensure one-to-one mapping of the two specified vectors.
@@ -299,7 +384,7 @@ get_by_partition <- function(df, partitioning, FUN, columns=NULL, simplify=T, dr
#' @return The vector `name`, where one to one mapping for the two vectors is ensured.
#' @export
one_to_one_mapping <- function(name, id, ext="_"){
    assertthat::assert_that(length(name)==length(id)&length(id)>0)
    assertthat::assert_that(length(name)==length(id)&&length(id)>0)
    assertthat::assert_that(is(name, "character"))
    assertthat::assert_that(is(ext, "character")&&length(ext)==1)

Loading