Commit fc276cfe authored by tekath's avatar tekath
Browse files

Overworked transcripts_view and proportion_barplot. Added basic bustools support.

parent 2638c512
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -4,12 +4,13 @@ export(check_unique_by_partition)
export(combine_to_matrix)
export(create_dtu_table)
export(get_by_partition)
export(granges_reduce_introns)
export(import_counts)
export(import_gtf)
export(move_columns_to_front)
export(one_to_one_mapping)
export(plot_dtu_table)
export(plot_prop_bar)
export(plot_proportion_barplot)
export(posthoc_and_stager)
export(rm_version)
export(run_drimseq)
+30 −20
Original line number Diff line number Diff line
#' Import quantification results
#'
#' Import the quantification results of many RNAseq quantifiers, including 'alevin' for single-cell data.
#' Import the quantification results of many RNA-seq quantifiers, including `alevin` and `bustools` for single-cell data.
#' Most likely the first step in your DTUrtle analysis.
#'
#' Can perform multiple scaling schemes, defaults to scaling schemes appropriate for DTU analysis. For bulk data it is recommended to additionally specify a `tx2gene` data frame as parameter.
#' This data frame must be a a two-column data frame linking transcript id (column 1) to gene id/name (column 2). This data frame is used to apply a DTU specific scaling scheme (dtuScaledTPM).
#' Please see [import_gtf()], [move_columns_to_front()] and [one_to_one_mapping()] to help with tx2gene creation.
#' See also [combine_to_matrix()], when output is a list of single-cell runs.
#'
#' @param files Vector of files to be imported. Optionally can be named to keep the samples names.
#' @param type Type of the quantification data. All tools supported by tximport can be selected. If you have single-cell data, the use of `alevin` is proposed.
#' @param type Type of the quantification data. All tools supported by \code{\link[tximport:tximport]{tximport}} can be selected, additionally to the newly implemented `bustools` support for single-cell data. If you have single-cell data, the use of `alevin` or `bustools` is proposed.
#' - `'salmon'`
#' - `'alevin'`
#' - `'kallisto'`
#' - `'bustools'`
#' - `'rsem'`
#' - `'stringtie'`
#' - `'sailfish'`
@@ -21,24 +24,29 @@
#' - For single-cell data (`type='alevin'`): A list of count matrices per sample. Should be combined and optionally added to a Seurat object with [combine_to_matrix()].
#' @family DTUrtle
#' @export
#' @seealso Please see [import_gtf()], [move_columns_to_front()] and [one_to_one_mapping()] to help with tx2gene creation. See also [combine_to_matrix()], when output is a list of single-cell runs.
#'
#' @examples
import_counts <- function(files, type, ...){
    assertthat::assert_that(type %in% c("salmon", "alevin", "kallisto", "bustools", "rsem", "stringtie", "sailfish", "none"))
    assertthat::assert_that(length(type)==1)
    assertthat::assert_that(type %in% c("salmon", "alevin", "kallisto", "rsem", "stringtie", "sailfish", "none"))
    message("Reading in ", length(files), " ", type, " runs.")

    args=list(...)

    if(type=="alevin"){
    if(type=="alevin"||type=="bustools"){
        return_obj = list()

        if(hasArg("countsFromAbundance")){
            warning("\nImport of alevin files currently does not support using scaling methods.\nPlease note, that in tagged-end single-cell protocols (like 10X chromium) it is assumed\nthat there is no length effect in the fragment generation process - thus making a scaling unnecessary.")
            warning(paste0("\nImport of ", type," files currently does not support using scaling methods.\nPlease note, that in tagged-end single-cell protocols (like 10X chromium or SureCell) it is assumed\nthat there is no length effect in the fragment generation process - thus making a scaling unnecessary."))
        }
        for(i in files){
            return_obj <- append(return_obj, tximport::tximport(files = i, type = "alevin", ...)$counts)

        if(type=="alevin"){
            return_obj <- lapply(files, function(i) tximport::tximport(files = i, type = "alevin", ...)$counts)
        }else{
            return_obj <- lapply(files, function(i) readin_bustools(files = i))
        }

        if(!is.null(names(files))){
            names(return_obj) <- names(files)
        }
@@ -74,16 +82,16 @@ import_counts <- function(files, type, ...){

#' Combine sparse matrices.
#'
#' Combine list of sparse transcription count matrices.
#' Combine list of sparse transcription count matrices. Presumably coming from [import_counts()] with single-cell data.
#'
#' Only needed when dealing with single-cell data. Adds a cellname extension if necessary.
#' This function adds a cellname extension if necessary.
#' Can optionally add the combined matrix to a existing Seurat object (keeping the cellname extension of the object and matching the cells).
#' Also removes overall non expressed features.
#'
#' @param tx_list List of sparse transcription count matrices, as returned by `import_counts()` for single-cell data.
#' @param cell_extensions Optional list of cellname extensions that are added to the cellnames of one sample. The cellnames and the extension are separated by an underscore '_'.
#' @param tx_list List of sparse transcription count matrices, as returned by [import_counts()] for single-cell data.
#' @param cell_extensions Optional list of cellname extensions that are added to the cellnames of the samples. The original cellnames and the extension are separated by an underscore '_'.
#' @param seurat_obj Optional seurat object, where the combined matrix is added as an assay. This has the advantage, that the cells are matched and subsetted if necessary. Currently only Seurat 3 objects are supported.
#' @param tx2gene Optional tx2gene/metadata data frame, can only be used in conjunction with a seurat object. Metadata is added as feature-level meta data to the created assay. The first column of the data frame must contain transcript names/ids. The same transcript names/ids as in the `tx_list` objects must be used.
#' @param tx2gene Optional tx2gene or metadata data frame, can only be used in conjunction with a seurat object. Metadata is added as feature-level meta data to the created assay. The first column of the data frame must contain transcript names/ids. The same transcript names/ids as in the `tx_list` objects must be used.
#' @param assay_name If the combined matrix should be added to an existing Seurat object, the name of the assay can be specified here.
#'
#' @return Either a combined sparse transcription count matrix or a seurat object which the combined sparse transcription count matrix as an assay.
@@ -211,9 +219,9 @@ combine_to_matrix <- function(tx_list, cell_extensions=NULL, seurat_obj=NULL, tx
#' Run the main DRIMSeq pipeline, including generation of a design matrix, gene/feature filtering and running the statistical computations of DRIMSeq (`dmPrecision()`, `dmFit()` and `dmTest()`)
#'
#' @param counts Can be either:
#' 1. Data frame or matrix with feature counts, where the rows correspond to features (e.g. transcripts). One column per sample/cell with the count data for the specified features must be present (The names of these columns must match with the identifiers in `id_col`).
#' 2. Seurat object with a transcription level assay as `active.assay` (most likely result object from `combine_to_matrix()`)
#' @param tx2gene Data frame, where the first column consists of feature identifiers and the second column consists of corresponding gene identifiers. Feature identifiers must match with the rownames of the counts object. If a Seurat object is provided in `counts` and `tx2gene` was provided in `combine_to_matrix()`, a vector of the colnames of the specific feature and gene identifierss is sufficient.
#' 1. (sparse) matrix with feature counts, where the rows correspond to features (e.g. transcripts). One column per sample/cell with the count data for the specified features must be present (The names of these columns must match with the identifiers in `id_col`).
#' 2. Seurat object with a transcription level assay as `active.assay` (most likely result object from [combine_to_matrix()])
#' @param tx2gene Data frame, where the first column consists of feature identifiers and the second column consists of corresponding gene identifiers. Feature identifiers must match with the rownames of the counts object. If a Seurat object is provided in `counts` and `tx2gene` was provided in [combine_to_matrix()], a vector of the colnames of the specific feature and gene identifierss is sufficient.
#' @param pd Data frame with at least a column of sample/cell identifiers (specified in `id_col`) and the comparison group definition (specified in `cond_col`).
#' @param id_col Name of the column in `pd`, where unique sample/cell identifiers are stored. If `NULL` (default), use rownames of `pd`.
#' @param cond_col Name of the column in `pd`, where the comparison groups are defined. If more than 2 levels/groups are present, the groups that should be used must be specified in `cond_levels`.
@@ -404,7 +412,7 @@ run_drimseq <- function(counts, tx2gene, pd, id_col=NULL, cond_col, cond_levels=
#' The second step tries to identify on singular transcript level the significantly different transcripts.
#'
#'
#' @param dturtle Result object of `run_drimseq()`. Must be of class `dturtle`.
#' @param dturtle Result object of [run_drimseq()]. Must be of class `dturtle`.
#' @param ofdr Overall false discovery rate (OFDR) threshold.
#' @param posthoc_filt Specify the minimal standard deviation of a transcripts porportion level that should be kept when performing posthoc filtering. To disbale poshoc filtering 0 or `FALSE` can be provided.
#' @return An extended `dturtle` object. Additional slots include:
@@ -444,17 +452,19 @@ posthoc_and_stager <- function(dturtle, ofdr=0.05, posthoc=0.1){
    stageRObj <- stageR::stageRTx(pScreen = pscreen, pConfirmation = pconfirm, pScreenAdjusted = F, tx2gene = tx2gene)
    stageRObj <- stageR::stageWiseAdjustment(stageRObj, method = "dtu", alpha = ofdr)
    fdr_table <- stageR::getAdjustedPValues(stageRObj, order = F, onlySignificantGenes = F)
    sig_gene <- unique(as.character(fdr_table$geneID[fdr_table$gene<ofdr]))
    fdr_table <- rapply(fdr_table, as.character, classes="factor", how="replace")
    sig_gene <- unique(fdr_table$geneID[fdr_table$gene<ofdr])

    if(length(sig_gene)>0){
        temp <- fdr_table[fdr_table$gene<ofdr&fdr_table$transcript<ofdr,]
        sig_tx <- setNames(as.character(temp$txID), temp$geneID)
        temp <- fdr_table[fdr_table$gene<ofdr&fdr_table$transcript<ofdr,,drop=F]
        sig_tx <- setNames(temp$txID, temp$geneID)
        message("Found ",length(sig_gene)," significant genes with ",length(sig_tx)," significant transcripts (OFDR: ",ofdr,")")
    }else{
        sig_gene <- NULL
        sig_tx <- NULL
        message("No gene passed the screening test. If applicable try to adjust the OFDR level.")
    }

    return_obj <- append(list("sig_gene" = sig_gene, "sig_tx" = sig_tx,
                                       "FDR_table" = fdr_table), dturtle)
    return_obj$used_filtering_options$posthoc_stager <- list("ofdr" = ofdr,
+34 −3
Original line number Diff line number Diff line
#' dmFilter for sparse matrix
#' Filtering for sparse or dense matrix
#'
#' Perform dmFilter-like filtering for sparse matrices
#' Perform dmFilter-like filtering for sparse or dense matrices
#'
#' Runtime optimised version, which can optionally be executed in parallel.
#' Runtime optimised version of `DRIMSeq::dmFilter`, which can optionally be executed in parallel.
#'
#' @param counts Sparse count matrix.
#' @param tx2gene Feature to gene mapping.
@@ -75,3 +75,34 @@ sparse_filter <- function(counts, tx2gene, BPPARAM=BiocParallel::SerialParam(),
    message("Retain ",nrow(counts_new), " of ",nrow(counts)," features.\nRemoved ", nrow(counts)-nrow(counts_new), " features.")
    return(counts_new)
}




#' Read bustools output
#'
#' Read in result files for single-cell data quantified with kallisto and bustools.
#'
#' Adds cell barcodes and feature information to the sparseMatrix
#'
#' @param files Path to result folder of the bustools call
#'
#' @return sparseMatrix with cell barcodes and feature information
readin_bustools <- function(files){
    assertthat::assert_that(dir.exists(files), msg = paste0("Could not find directory '",files,"'. Please provide the bustools output directory as 'files'."))
    mtx <- Sys.glob(paste0(files,"*.mtx"))
    rownames <- Sys.glob(paste0(files,"*.barcodes.txt"))
    colnames <- Sys.glob(paste0(files,"*.genes.txt"))
    assertthat::assert_that(length(mtx)==1, msg = paste0("Could not find a .mtx file in '",files,"'. Please provide the bustools output directory as 'files'."))
    assertthat::assert_that(length(rownames)==1, msg = paste0("Could not find a .barcodes.txt file in '",files,"', which provides cell ids. Please provide the bustools output directory as 'files'."))
    assertthat::assert_that(length(colnames)==1, msg = paste0("Could not find a .genes.txt file in '",files,"', which provides feature information. Please provide the bustools output directory as 'files'."))
    message("\tReading in Matrix.")
    mtx <- Matrix::readMM(mtx)
    message("\tAssigning dimnames.")
    rownames <- scan(rownames, what = "character", quiet=T)
    assertthat::assert_that(length(rownames)==nrow(mtx), msg = "Number of barcodes does not match to matrix.")
    colnames <- scan(colnames, what = "character", quiet=T)
    assertthat::assert_that(length(colnames)==ncol(mtx), msg = "Number of features does not match to matrix.")
    dimnames(mtx) <- list(rownames, colnames)
    return(as(mtx, "dgCMatrix"))
}
+50 −15
Original line number Diff line number Diff line
@@ -136,7 +136,7 @@ getmax <- function(gID, dturtle){
#' @examples ## import_gtf("path_to/your_annotation_file.gtf")
import_gtf <- function(gtf_file){
    assertthat::assert_that(file.exists(gtf_file))
    gtf_grange <- rtracklayer::readGFFAsGRanges(gtf_file)
    gtf_grange <- rtracklayer::import(gtf_file)
    df <- as.data.frame(gtf_grange[gtf_grange$type=="transcript"])
    return(df)
}
@@ -287,34 +287,69 @@ get_by_partition <- function(df, partitioning, FUN, columns=NULL, simplify=T, dr

#' Ensure one-to-one mapping
#'
#' Ensure one-to-one mapping of the two specified columns in the data frame.
#' Ensure one-to-one mapping of the two specified vectors.
#'
#' First checks if every unique value in `name` corresponds with a unique value in `id`.
#' If not, changes the disagreeing values in `name` by extending the label with the `ext` character and a number.
#'
#' @param df A data frame, containing the two provided columns `name` and `id`
#' @param name A column of `df`. If no one-to-one mapping exists, the values of this column will be changed (by extending with `ext`)!
#' @param id A column of df. This is the preferred place for identifier columns, as this column will not be touched in case of a disagreement.
#' @param name A character vector. If no one-to-one mapping exists, the values of this vector will be changed (by extending with `ext`)!
#' @param id A vector, preferably for identifiers. This column will not be touched in case of a disagreement.
#' @param ext The extension character.
#'
#' @return A data frame, where one to one mapping for the two columns is ensured.
#' @return The vector `name`, where one to one mapping for the two vectors is ensured.
#' @export
one_to_one_mapping <- function(df, name, id, ext="_"){
    assertthat::assert_that(is.data.frame(df))
    assertthat::assert_that(all(c(name,id) %in% colnames(df)))
    assertthat::assert_that(is(ext, "character"))
one_to_one_mapping <- function(name, id, ext="_"){
    assertthat::assert_that(length(name)==length(id)&length(id)>0)
    assertthat::assert_that(is(name, "character"))
    assertthat::assert_that(is(ext, "character")&&length(ext)==1)

    not_correct <- lapply(split(df[[id]], df[[name]]), unique)
    not_correct <- lapply(split(id, name), unique)
    not_correct <- not_correct[lengths(not_correct)!=1]

    if(length(not_correct)>0){
        lapply(not_correct, function(x)
            lapply(seq(from = 2, along.with = x[-1]), function(i)
                df[[name]][df[[id]]==x[[i]]] <<- paste0(df[[name]][df[[id]]==x[[i]]],ext,i)))
                name[id==x[[i]]] <<- paste0(name[id==x[[i]]],ext,i)))
        message("Changed ", sum(lengths(not_correct))-length(not_correct), " names.")
        return(df)
        return(name)
    }else{
        message("No changes needed - already one to one mapping.")
        return(df)
        message("No changes needed -> already one to one mapping.")
        return(name)
    }
}

#' Reduce introns in granges
#'
#' Reduces length of introns to 'min_intron_size' in the provided granges.
#'
#' Reduces the size of introns to the square root of the length, but not lower than 'min_intron_size'.
#'
#' @param granges A granges object that shall be altered.
#' @param min_intron_size The minimal intro length, that shall be retained.
#'
#' @return A list containing:
#' - `granges`: The granges object with reduced introns.
#' - `reduced_regions`: A granges object with the ranges of the reduced intron regions and their new size.
#' @export
granges_reduce_introns <- function(granges, min_intron_size){
    assertthat::assert_that(is(granges, "GenomicRanges"))
    assertthat::assert_that(assertthat::is.count(min_intron_size))
    granges_reduced <- granges
    granges_reduced$new_start <- GenomicRanges::start(granges_reduced)
    regions_to_reduce <- GenomicRanges::gaps(granges)
    #exclude first region, if transcript does not start on first base
    if(GenomicRanges::start(regions_to_reduce)[1]==1){
        regions_to_reduce <- regions_to_reduce[-1]
    }
    #compute reduced region size
    #do not artificially inflate regions smaller than min_intron_size
    regions_to_reduce <- regions_to_reduce[GenomicRanges::width(regions_to_reduce)>min_intron_size,]
    regions_to_reduce$new_width <- sapply(ceiling(sqrt(GenomicRanges::width(regions_to_reduce))), FUN = function(x) max(min_intron_size, x))
    for(j in seq_along(regions_to_reduce)){
        x <- regions_to_reduce[j]
        granges_reduced[GenomicRanges::start(granges_reduced)>GenomicRanges::start(x),]$new_start <- granges_reduced[GenomicRanges::start(granges_reduced)>GenomicRanges::start(x),]$new_start-GenomicRanges::width(x)+x$new_width
    }
    GenomicRanges::start(granges) <- granges_reduced$new_start
    GenomicRanges::end(granges) <- GenomicRanges::start(granges)+GenomicRanges::width(granges_reduced)-1
    return(list("granges" = granges, "reduced_regions" = regions_to_reduce))
}
+182 −138

File changed.

Preview size limit exceeded, changes collapsed.

Loading