Commit 813de833 authored by tekath's avatar tekath
Browse files

Runtime optimized sparse filtering.

parent 97c4c33f
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -6,10 +6,10 @@ Author: Tobias Tekath
Maintainer: The package maintainer <yourself@somewhere.net>
Description: More about what it does (maybe more than one line)
    Use four spaces when indenting paragraphs within the Description.
License: What license is it under?
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Depends: R (>= 3.3.0)
Imports: tximport, DRIMSeq, stageR, matrixStats, GenomicFeatures, formattable, DT, Gviz, assertthat, Matrix
Depends: R (>= 3.4.0)
Imports: tximport, DRIMSeq, stageR, rtracklayer, formattable, DT, Gviz, assertthat, Matrix
RoxygenNote: 7.1.0
Roxygen: list(markdown = TRUE)
+10 −11
Original line number Diff line number Diff line
exportPattern("^[[:alpha:]]+")
# Generated by roxygen2: do not edit by hand

import('GenomicFeatures')
import('tximport')
import('DRIMSeq')
import('stageR')
import('formattable')
import('DT')
import('Gviz')
import('assertthat')
importFrom('matrixStats', 'rowVars')
importFrom('Matrix', 'summary')
export(combine_to_matrix)
export(create_dtu_table)
export(import_counts)
export(import_gtf)
export(move_columns_to_front)
export(posthoc_and_stager)
export(rm_version)
export(run_drimseq)
export(sparse_filter)
+403 −68

File changed.

Preview size limit exceeded, changes collapsed.

+98 −110
Original line number Diff line number Diff line
#' Title
#' Replaces NA values by 1.
#'
#' @param x
#' @param x Vector of values (i.e. pvalues).
#'
#' @return
#' @export
#'
#' @examples
#' @return Vector of values with NAs replaced.
no_na <- function(x){
    return(ifelse(is.na(x), 1, x))
}


#TODO: test
#' Filter out
#'
#' @param d
#' @param filter
#'
#' @return
#' @export
#'
#' @examples
smallProportionSD <- function(d, filter) {
    cts <- as.matrix(subset(counts(d), select=-c(gene_id, feature_id)))
    gene.cts <- rowsum(cts, counts(d)$gene_id)
    total.cts <- gene.cts[match(counts(d)$gene_id, rownames(gene.cts)),]
    browser()
    cts <- as.matrix(subset(DRIMSeq::counts(d), select=-c("gene_id", "feature_id")))
    gene.cts <- rowsum(cts, DRIMSeq::counts(d)$gene_id)
    total.cts <- gene.cts[match(DRIMSeq::counts(d)$gene_id, rownames(gene.cts)),]
    props <- cts/total.cts
    propSD <- sqrt(matrixStats::rowVars(props))
    #propSD <- sqrt(matrixStats::rowVars(props))

    #directly call SD?
    propSD <- sqrt(apply(props,1,var))

    return(propSD < filter)
}

#' Title
#'
#' @param tx_mat
#' Merge two or more sparse matrices.
#'
#' @return
#' @export
#' @param tx_mat List of sparse matrices that shall be merged.
#'
#' @examples
#' @return Sparse matrix, containing the information of all provided matrices.
merge_sparse <- function(tx_mat) {

    cnnew <- character()
    rnnew <- character()
    x <- vector()
@@ -46,7 +41,7 @@ merge_sparse <- function(tx_mat) {
    j <- numeric()

    for (mat in tx_mat) {

        if(!is.null(mat)){
            cnold <- colnames(mat)
            rnold <- rownames(mat)
            cnnew <- union(cnnew,cnold)
@@ -59,134 +54,127 @@ merge_sparse <- function(tx_mat) {
            j <- c(j,cindnew[ind[,2]])
            x <- c(x,ind[,3])
        }
    return(sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew)))
    }
    return(Matrix::sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew)))
}


#' Title
#' Add tx2gene
#'
#' Add a data frame as feature level metadata to the Seurat active assay.
#'
#' @param seurat_obj
#' @param tx2gene
#'
#' @return
#' @export
#'
#' @examples
#' @return Seurat object with added data.
seurat_add_tx2gene <- function(seurat_obj, tx2gene){
    assay_name <- seurat_obj@active.assay
    order <- rownames(seurat_obj[[assay_name]]@meta.features)
    tx2gene <- tx2gene[match(order, tx2gene[[1]]),]
    assert_that(are_equal(nrow(seurat_obj[[assay_name]]@meta.features), nrow(tx2gene)), msg = "'tx2gene' data frame does not contain information for all transcripts.")
    assertthat::assert_that(nrow(seurat_obj[[assay_name]]@meta.features) == nrow(tx2gene), msg = "'tx2gene' data frame does not contain information for all transcripts.")
    seurat_obj[[assay_name]]@meta.features <- cbind(seurat_obj[[assay_name]]@meta.features,tx2gene)
    return(seurat_obj)
}


#' Title
#TODO: Add more posthoc filters
#' Posthoc filtering
#'
#' @param drim
#' @param filt
#' Perform posthoc filtering.
#'
#' @return
#' @export
#' Sets pvalue and adjusted pvalue of 'filtered' elements to 1.
#' @param drim
#' @param filt Threshold
#'
#' @examples
#' @return A filtered `DRIMSeq::results()` data frame.
run_posthoc <- function(drim, filt){
    res.txp.filt <- DRIMSeq::results(drim, level="feature")
    res_txp_filt <- DRIMSeq::results(drim, level="feature")
    filt <- smallProportionSD(drim, filt)
    res.txp.filt$pvalue[filt] <- 1
    res.txp.filt$adj_pvalue[filt] <- 1
    res_txp_filt$pvalue[filt] <- 1
    res_txp_filt$adj_pvalue[filt] <- 1
    message("Posthoc filtered ", sum(filt, na.rm = TRUE), " transcripts")
    return(res.txp.filt)
}

#TODO: reevaluate and switch to message
#' Title
#'
#' @param drim
#' @param drim_filt
#'
#' @return
#' @export
#'
#' @examples
count_filtered <- function(drim, drim_filt){
    drim_filt_temp <- dmFilter(drim)
    rm_trans <- nrow(drim@counts)-nrow(drim_filt@counts)
    rm_gene <- length(drim@counts@partitioning)-length(drim_filt@counts@partitioning)

    print(paste0("Removed ", rm_trans," (",formatC(((rm_trans)/nrow(drim@counts))*100, 2, format = "f"),"%) of ",nrow(drim@counts)," transcripts due to filters."))
    print(paste0("Removed ", rm_gene," (",formatC(((rm_gene)/length(drim@counts@partitioning))*100, 2, format = "f"),"%) of ",length(drim@counts@partitioning)," genes due to filters."))
    print(paste0("Of the removed transcripts: ", nrow(drim@counts)-nrow(drim_filt_temp@counts)," (",formatC(((nrow(drim@counts)-nrow(drim_filt_temp@counts))/rm_trans)*100, 2, format = "f"),"%) had no expression."))
    print(paste0("This explains ", length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning)," (",formatC(((length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning))/rm_gene)*100, 2, format = "f"),"%) of the removed genes."))
    print(paste0("--> Proceed with ",length(drim_filt@counts@partitioning)," genes and ",nrow(drim_filt@counts)," transcripts."))
    return(res_txp_filt)
}


#' Title
#' Get the transcript-wise proportion differnces of the specified gene
#'
#' @param gID
#' @param dtu
#' @param gID gene identifier
#' @param dturtle dturtle object
#'
#' @return
#' @export
#'
#' @examples
get_diff <- function(gID, dtu){
    group <- dtu$group
    y <- data.frame(row.names = rownames(dtu$drim@fit_full[[gID]]))
    y$a <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
    y$b <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
#' @return Data frame with the transcript-wise proportion differences.
get_diff <- function(gID, dturtle){
    group <- dturtle$group
    y <- data.frame(row.names = rownames(dturtle$drim@fit_full[[gID]]))
    y$a <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
    y$b <- apply(dturtle$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
    y$diff <- y$a-y$b
    return(y)
}

#' Title
#' Add a column specifying the maximal difference between the two comparison groups
#'
#' @param dtu_table
#' @param dtu
#'
#' @return
#' @export
#' @param dtu_table The dtu data frame where the column shall be added.
#' @param dturtle The corresponding dturtle object the `dtu_table` originates from.
#'
#' @examples
add_max_delta <- function(dtu_table, dtu){
#' @return A dtu data frame with the added column.
add_max_delta <- function(dtu_table, dturtle){
    getmax <- function(gID){
        y <- get_diff(gID, dtu)
        y <- get_diff(gID, dturtle)
        #get absoulte maximum while preserving sign
        return(y$diff[which.max(abs(y$diff))])
    }

    dtu_table[[paste0("max(",levels(dtu$group)[1], "-",levels(dtu$group)[2],")")]] <- as.numeric(sapply(dtu_table$geneID, FUN = getmax))
    dtu_table[[paste0("max(",levels(dturtle$group)[1], "-",levels(dturtle$group)[2],")")]] <- as.numeric(sapply(dtu_table$geneID, FUN = getmax))
    return(dtu_table)
}


#TODO: Enhance!
#' Title
#' Parse a gtf file and return a transcript level dataframe.
#'
#' @param gtf_file
#' @param gtf_file Path to the gtf/gff file that shall be analysed.
#'
#' @return
#' @return A data frame of the availble transcript level information (e.g. the tx2gene mapping information)
#' @export
#'
#' @examples
create_tx2gene <- function(gtf_file){
    txdb <- GenomicFeatures::makeTxDbFromGFF(gtf_file)
    tx2gene <- select(txdb, keys(txdb, keytype = "TXNAME"), "GENEID", "TXNAME")
    return(tx2gene)
#' @examples create_tx2gene("path_to/your_annotation_file.gtf")
import_gtf <- function(gtf_file){
    gtf_grange <- rtracklayer::readGFFAsGRanges("/data/sperm_test/alevin/gencode.v33.annotation.gtf")
    df <- as.data.frame(gtf_grange[gtf_grange$type=="transcript"])
    return(df)
}


#' Reorder columns
#'
#' Reorder the columns of a data frame.
#'
#' Sets the desired `columns` of the dataframe `df` to the first columns. Does not change the order of the others.
#'
#' @param df The data frame that shall be reordered.
#' @param columns One or multiple column names that should be moved to the front of the dataframe. Order is kept.
#'
#' @return Data frame with reordered columns.
#' @export
#'
#' @examples move_columns_to_fron(df, c("new_first_column", "new_second_column"))
move_columns_to_front <- function(df, columns){
    assertthat::assert_that(all(columns %in% colnames(df)), msg = "Could not find all provided column names in data frame.")
    col_order <- setdiff(colnames(df), columns)
    return(df[,c(columns, col_order)])
}


#' Title
#' Remove ensembl version number
#'
#' @param x
#' Remove the version number of ensembl gene/transcript identifiers.
#'
#' @return
#' Removes everything beyond the first dot ('.') in each provied identifier.
#'
#' @param x Vector of identifiers.
#'
#' @return Vector of identifiers without version numbers.
#' @export
#'
#' @examples
rm_tx_version <- function(x){
#' @examples rm_version(c("ENSG00000000001.5","ENST00000000001.2"))
rm_version <- function(x){
    return(sub("\\..*", "", x))
}
+1 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@
#'
#' @return
#' @family DTUrtle visualization
#' @export
#' @seealso
#'
#' @examples
Loading