Commit c4e53b3b authored by tekath's avatar tekath
Browse files

Started roxygen2 documentation. Partially added dturtle_sc. For real!

parent dba8ade9
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -9,4 +9,7 @@ Description: More about what it does (maybe more than one line)
License: What license is it under?
Encoding: UTF-8
LazyData: true
Imports: tximport, DRIMSeq, stageR, stageR
Depends: tximport, DRIMSeq, stageR, GenomicFeatures, assertthat
Imports: matrixStats
RoxygenNote: 7.1.0
Roxygen: list(markdown = TRUE)
+7 −0
Original line number Diff line number Diff line
exportPattern("^[[:alpha:]]+")

import('GenomicFeatures')
import('tximport')
import('DRIMSeq')
import('stageR')
import('assertthat')
importFrom('matrixStats', 'rowVars')
+195 −39
Original line number Diff line number Diff line


count_filtered <- function(drim, drim_filt){
    drim_filt_temp <- dmFilter(drim)
    rm_trans <- nrow(drim@counts)-nrow(drim_filt@counts)
    rm_gene <- length(drim@counts@partitioning)-length(drim_filt@counts@partitioning)

    print(paste0("Removed ", rm_trans," (",formatC(((rm_trans)/nrow(drim@counts))*100, 2, format = "f"),"%) of ",nrow(drim@counts)," transcripts due to filters."))
    print(paste0("Removed ", rm_gene," (",formatC(((rm_gene)/length(drim@counts@partitioning))*100, 2, format = "f"),"%) of ",length(drim@counts@partitioning)," genes due to filters."))
    print(paste0("Of the removed transcripts: ", nrow(drim@counts)-nrow(drim_filt_temp@counts)," (",formatC(((nrow(drim@counts)-nrow(drim_filt_temp@counts))/rm_trans)*100, 2, format = "f"),"%) had no expression."))
    print(paste0("This explains ", length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning)," (",formatC(((length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning))/rm_gene)*100, 2, format = "f"),"%) of the removed genes."))
    print(paste0("--> Proceed with ",length(drim_filt@counts@partitioning)," genes and ",nrow(drim_filt@counts)," transcripts."))
}

run_drimseq <- function(pd, cols, cond_levels, all_counts = all_counts_dtu, n_core=30){
    samp <- subset(pd, select=cols)
    colnames(samp) <- c("sample_id", "condition")
    condition_levels <- cond_levels
    samp$condition <- factor(samp$condition, levels=condition_levels)
#TODO: add bulk option
#' Title
#'
#' @param path
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
readin_alevin <- function(path,  ...){
    message("Read in ", length(path), " alevin runs.")
    return_obj = list()
    for(i in path){
        browser()
        #temp <- tximport:::readAlevin(files=i, dropInfReps=F, forceSlow=F, ...)
        return_obj <- append(return_obj, tximport(files = i, type = "alevin", ...))
    }

    if(!is.null(names(path))){
        names(return_obj) <- names(path)
    }
    return(return_obj)
}


#TODO: Check if seurat_obj is seurat object!
#' Title
#'
#' @param tx_mat
#' @param seurat_obj
#'
#' @return
#' @export
#'
#' @examples
add_to_seurat <- function(tx_mat, seurat_obj){
    if(!is.list(tx_mat)){
        tx_mat <- list(tx_mat)
    }

    if(is.null(names(tx_mat))|length(unique(names(tx_mat)))!=length(tx_mat)){
        names(tx_mat) <- paste0("obj_",seq_along(tx_mat))
    }

    if(!all(as.logical(lapply(tx_mat, FUN = function(x) is(x, 'sparseMatrix'))))){
        stop("Your tx_mat object contains non sparseMatrix elements.")
    }

    #map cell extensions
    extensions_in_seur <- sum(grepl(pattern = "_", x = Cells(seurat_obj), fixed = T))>length(Cells(seurat_obj))*0.1
    if(extensions_in_seur){
        message("At least some seurat cellnames show a cellname extension.")
        extension_list <- paste0("_", unique(stringi::stri_split_fixed(str= Cells(seurat_obj), pattern = "_", n = 2, simplify = T)[,2]))
        extension_mapping <- names(tx_mat)
        if(length(extension_list)!=length(extension_mapping)){
            stop(paste0("Could not 1:1 map seurat cellname extensions and tx file list.\n",
                        "Try subsetting the seurat object if you do not want to provide tx information of all cells."))
        }
        message("Map extensions:\n\t", paste0(extension_mapping, " --> '", extension_list, "'\n\t"))

        for(i in seq_along(tx_mat)){
            colnames(tx_mat[[i]]) <- paste0(colnames(tx_mat[[i]]), extension_list[[i]])
        }
    }

    all_cells <- unlist(lapply(tx_mat, FUN = colnames))
    dup_cells <- all_cells[duplicated(all_cells)]

    if(length(dup_cells)>0){
        stopstr <- ifelse(length(dup_cells)>10, paste0("Found ", length(dup_cells), " duplicated cellnames!"), paste0("Found duplicated cellnames:\n\t", paste0(dup_cells, collapse = "\n\t")))
        stop(stopstr)
    }
    message("Merging matrices")
    tx_mat <- merge.sparse(tx_mat)

    common_cells <- intersect(Cells(seurat_obj), colnames(tx_mat))
    uniq_cells <- setdiff(colnames(tx_mat), Cells(seurat_obj))
    extra_seurat_cells <- setdiff(Cells(seurat_obj), colnames(tx_mat))
    message("Of ", ncol(tx_mat), " cells, ", length(common_cells), " (", round(length(common_cells)/ncol(tx_mat)*100), "%) could be found in the provided seurat object.")
    message(length(uniq_cells), " (",  round(length(uniq_cells)/ncol(tx_mat)*100),"%) are unique to the tx files.")
    message("The seurat object contains ", length(extra_seurat_cells), " additional cells.")
    if(length(uniq_cells)>0|length(extra_seurat_cells)>0){
        message("Subsetting!")
    }

    seurat_obj <- subset(seurat_obj, cells=common_cells)
    tx_mat <- tx_mat[,common_cells]

    message("Adding assay 'dtutx'")
    seurat_obj[["dtutx"]] <- CreateAssayObject(counts=tx_mat)

    return(seurat_obj)
}






#' Compute the main DRIMSeq results
#'
#' Run the main DRIMSeq pipeline, including generation of a design matrix, gene/feature filtering and running the statistical computations of DRIMSeq (`dmPrecision()`, `dmFit()` and `dmTest()`)
#'
#' @param counts Data frame with gene to transcript mapping and feature counts, where the rows correspond to features.
#' - One column must be called `gene_id` and should contain gene identifiers.
#' - One column must be called `feature_id` and should contain unique feature identifiers.
#' - One column per sample/cell with the count data for the specified features must be present (The names of these columns must match with the identifiers in `id_col`).
#' @param pd Data frame with at least a column of sample/cell identifiers (specified in `id_col`) and the comparison group definition (specified in `cond_col`).
#' @param id_col Name of the column in `pd`, where unique sample/cell identifiers are stored. If `NULL` (default), use rownames of `pd`.
#' @param cond_col Name of the column in `pd`, where the comparison groups are defined. Can contain more than 2 levels/groups, as the groups that should be used are defined in `cond_levels`.
#' @param cond_levels Define two levels/groups of `cond_col`, that should be compared. The order of the levels states the comparison formula (i.e. `cond_col[1]-cond_col[2]`).
#' @param filtering_strategy Define the filtering_strategy used in `dmFilter()`.
#' - `'bulk'`: Predefined strategy for bulk RNAseq data. (default)
#' - `'sc'`: Predefined strategy for single-cell RNAseq data.
#' - `'own'`: Can be used to specify a user-defined strategy via the `...` argument (using the parameters of `dmFilter()`).
#' @param n_core Set the number of CPU cores that should be used in the statistical computations.
#' @param ... If `filtering_strategy=='own'` specify the wished parameters of `dmFilter()` here.
#'
#' @return `DRIM` object with the key results, that can be used in the DTUrtle steps hereafter. The object is just a easily accesible list with the following items:
#' - `drim`: The results of the DRIMSeq statistical computations (`dmTest()`).
#' - `design_full`: The design matrix generated from the specified `pd` columns
#' - `cond_levels`: A copy of the specified comparison groups.
#' - `group`: ?
#' - `pct_exp_tx`: Data frame of the expressed-in percentage of all transcripts. [TODO: split by group?]
#' - `pct_exp_gene`: Data frame of the expressed-in percentage of all genes. [TODO: split by group?]
#'
#' @export
#'
#' @examples
run_drimseq <- function(counts, pd, id_col=NULL, cond_col, cond_levels, filtering_strategy="bulk", n_core=1, ...){

    assert_that(length(cond_levels)==2)
    assert_that(filtering_strategy %in% c("bulk", "sc", "own"), msg = "Please select a valid filtering strategy ('bulk', 'sc' or 'own').")
    assert_that(is.count(n_core))

    if(is.null(id_col)){
        samp <- data.frame("sample_id"=row.names(pd), "condition"=pd[[cond_col]], stringsAsFactors = F)
    }else{
        samp <- data.frame("sample_id"=pd[id_col], "condition"=pd[[cond_col]], stringsAsFactors = F)
    }
    samp$condition <- factor(samp$condition, levels=cond_levels)
    #exclude NA samples!
    exclude <- as.vector(samp$sample_id[is.na(samp$condition)])
    samp <- samp[!is.na(samp$condition),]
    counts <- all_counts[ , !(names(all_counts) %in% exclude)]
    counts <- counts[ , !(names(counts) %in% exclude)]
    if(length(exclude)!=0){
        message("Excluding samples ", paste(exclude, collapse=" "), " for this comparison!")
    }
@@ -29,49 +154,83 @@ run_drimseq <- function(pd, cols, cond_levels, all_counts = all_counts_dtu, n_co

    drim <- dmDSdata(counts = counts, samples = samp)

    #allow 20% of samples to be gene dropouts
    #TODO: Reevaluate!
    total_sample <- round(nrow(samp)*0.8)
    #if smallest_group > 10, set to 10
    smallest_group <- min(min(table(samp$condition)), 10)

    switch(filtering_strategy, sc={
        drim_filt <- dmFilter(drim,min_samps_feature_prop=smallest_group, min_feature_prop=0.05)
    },
    bulk={
        drim_filt <- dmFilter(drim, min_samps_feature_expr=smallest_group, min_feature_expr=1,
                              min_samps_gene_expr=total_sample, min_gene_expr=5,
                              min_samps_feature_prop=smallest_group, min_feature_prop=0.05)
    },
    own={
        drim_filt <- dmFilter(drim, ...)
    })


    count_filtered(drim, drim_filt)

    cores=n_core
    message("Using ", cores, " cpu-cores for computation!")
    pct_exp_tx <- data.frame("gene"=rep(names(drim_filt@counts@partitioning), lengths(drim_filt@counts@partitioning)),
                             "tx"=rownames(drim_filt@counts@unlistData),
                             "pct_exp"=rowSums(drim_filt@counts@unlistData!=0)/ncol(drim_filt@counts@unlistData))

    pct_exp_gene <- data.frame("gene"=names(drim_filt@counts@partitioning),
                               "pct_exp"=sapply(drim_filt@counts@partitioning,
                                                FUN = function(x) sum(colSums(drim_filt@counts@unlistData[names(x),])!=0)/ncol(drim_filt@counts@unlistData)))

    message("Using ", n_core, " CPU cores for computation!")
    design_full <- model.matrix(~condition, data=samp)

    drim_test <- dmPrecision(drim_filt, design=design_full, prec_subset=1, BPPARAM=BiocParallel::MulticoreParam(cores))
    drim_test <- dmFit(drim_test, design=design_full, BPPARAM=BiocParallel::MulticoreParam(cores))
    drim_test <- dmTest(drim_test, coef=colnames(design_full)[2])
    if(n_core>1){
        bbparam <- BiocParallel::MulticoreParam(n_core)
    }else{
        bbparam <- BiocParallel::SerialParam()
    }

    drim_test <- dmPrecision(drim_filt, design=design_full, prec_subset=1, BPPARAM=bbparam, speed=F, add_uniform=T)
    drim_test <- dmFit(drim_test, design=design_full, BPPARAM=bbparam, add_uniform=T)
    drim_test <- dmTest(drim_test, coef=colnames(design_full)[2], BBPARAM=bbparam)

    group <- factor(samp$condition, levels = cond_levels)

    return(list("drim"=drim_test, "design_full"=design_full, "cond_levels"=condition_levels, "group"=group))
    return(list("drim"=drim_test, "design_full"=design_full, "cond_levels"=cond_levels, "group"=group, "pct_exp_tx"=pct_exp_tx, "pct_exp_gene"=pct_exp_gene))
}

smallProportionSD <- function(d, filter=filt) {
    cts <- as.matrix(subset(counts(d), select=-c(gene_id, feature_id)))
    gene.cts <- rowsum(cts, counts(d)$gene_id)
    total.cts <- gene.cts[match(counts(d)$gene_id, rownames(gene.cts)),]
    props <- cts/total.cts
    propSD <- sqrt(matrixStats::rowVars(props))
    propSD < filter
}

#' Title
#'
#' @param drim
#' @param filt
#'
#' @return
#' @export
#'
#' @examples
run_posthoc <- function(drim, filt){
    res.txp.filt <- DRIMSeq::results(drim, level="feature")
    filt <- smallProportionSD(drim)
    filt <- smallProportionSD(drim, filt)
    res.txp.filt$pvalue[filt] <- 1
    res.txp.filt$adj_pvalue[filt] <- 1
    message("Posthoc filtered ", sum(filt, na.rm = TRUE), " transcripts")
    return(res.txp.filt)
}

#' Title
#'
#' @param dtu
#' @param ofdr
#' @param posthoc
#' @param posthoc_filt
#'
#' @return
#' @export
#'
#' @examples
posthoc_and_stager <- function(dtu, ofdr=0.05, posthoc=T, posthoc_filt=0.1){
    # stageR ------------------------------------------------------------------
    res <- DRIMSeq::results(dtu$drim)
    res_txp <- DRIMSeq::results(dtu$drim, level="feature")

@@ -80,7 +239,6 @@ posthoc_and_stager <- function(dtu, ofdr=0.05, posthoc=T, posthoc_filt=0.1){
        res_txp <- run_posthoc(dtu$drim, posthoc_filt)
    }

    no_na <- function(x) ifelse(is.na(x), 1, x)
    res$pvalue <- no_na(res$pvalue)
    res_txp$pvalue <- no_na(res_txp$pvalue)
    pScreen <- res$pvalue
@@ -99,5 +257,3 @@ posthoc_and_stager <- function(dtu, ofdr=0.05, posthoc=T, posthoc_filt=0.1){
    return(append(dtu, list("final_q" = final_q, "final_q_tx" = final_q_tx, "final_q_unfiltered" = final_q_unfiltered)))
}

R/helper_functions.R

0 → 100644
+126 −0
Original line number Diff line number Diff line
#' Title
#'
#' @param x
#'
#' @return
#' @export
#'
#' @examples
no_na <- function(x){
    return(ifelse(is.na(x), 1, x))
}


#' Filter out
#'
#' @param d
#' @param filter
#'
#' @return
#' @export
#'
#' @examples
smallProportionSD <- function(d, filter) {
    cts <- as.matrix(subset(counts(d), select=-c(gene_id, feature_id)))
    gene.cts <- rowsum(cts, counts(d)$gene_id)
    total.cts <- gene.cts[match(counts(d)$gene_id, rownames(gene.cts)),]
    props <- cts/total.cts
    propSD <- sqrt(matrixStats::rowVars(props))
    return(propSD < filter)
}

#' Title
#'
#' @param tx_mat
#'
#' @return
#' @export
#'
#' @examples
merge_sparse <- function(tx_mat) {

    cnnew <- character()
    rnnew <- character()
    x <- vector()
    i <- numeric()
    j <- numeric()

    for (M in tx_mat) {

        cnold <- colnames(M)
        rnold <- rownames(M)

        cnnew <- union(cnnew,cnold)
        rnnew <- union(rnnew,rnold)

        cindnew <- match(cnold,cnnew)
        rindnew <- match(rnold,rnnew)
        ind <- summary(M)
        i <- c(i,rindnew[ind[,1]])
        j <- c(j,cindnew[ind[,2]])
        x <- c(x,ind[,3])
    }
    return(sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew)))
}


#TODO: reevaluate and switch to message
#' Title
#'
#' @param drim
#' @param drim_filt
#'
#' @return
#' @export
#'
#' @examples
count_filtered <- function(drim, drim_filt){
    drim_filt_temp <- dmFilter(drim)
    rm_trans <- nrow(drim@counts)-nrow(drim_filt@counts)
    rm_gene <- length(drim@counts@partitioning)-length(drim_filt@counts@partitioning)

    print(paste0("Removed ", rm_trans," (",formatC(((rm_trans)/nrow(drim@counts))*100, 2, format = "f"),"%) of ",nrow(drim@counts)," transcripts due to filters."))
    print(paste0("Removed ", rm_gene," (",formatC(((rm_gene)/length(drim@counts@partitioning))*100, 2, format = "f"),"%) of ",length(drim@counts@partitioning)," genes due to filters."))
    print(paste0("Of the removed transcripts: ", nrow(drim@counts)-nrow(drim_filt_temp@counts)," (",formatC(((nrow(drim@counts)-nrow(drim_filt_temp@counts))/rm_trans)*100, 2, format = "f"),"%) had no expression."))
    print(paste0("This explains ", length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning)," (",formatC(((length(drim@counts@partitioning)-length(drim_filt_temp@counts@partitioning))/rm_gene)*100, 2, format = "f"),"%) of the removed genes."))
    print(paste0("--> Proceed with ",length(drim_filt@counts@partitioning)," genes and ",nrow(drim_filt@counts)," transcripts."))
}


#' Title
#'
#' @param gID
#' @param dtu
#'
#' @return
#' @export
#'
#' @examples
get_diff <- function(gID, dtu){
    group <- dtu$group
    y <- data.frame(row.names = rownames(dtu$drim@fit_full[[gID]]))
    y$a <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
    y$b <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
    y$diff <- y$a-y$b
    return(y)
}

#' Title
#'
#' @param dtu_table
#' @param dtu
#'
#' @return
#' @export
#'
#' @examples
add_max_delta <- function(dtu_table, dtu){
    getmax <- function(gID){
        y <- get_diff(gID, dtu)
        #get absoulte maximum while preserving sign
        return(y$diff[which.max(abs(y$diff))])
    }

    dtu_table[[paste0("max(",levels(dtu$group)[1], "-",levels(dtu$group)[2],")")]] <- as.numeric(sapply(dtu_table$geneID, FUN = getmax))
    return(dtu_table)
}
+30 −19
Original line number Diff line number Diff line
add_max_delta <- function(dtu_table, dtu){

    getmax <- function(gID){
        y <- data.frame(row.names = rownames(dtu$drim@fit_full[[gID]]))
        y$a <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[1])], 1, unique)
        y$b <- apply(dtu$drim@fit_full[[gID]][, which(group==levels(group)[2])], 1, unique)
        y$diff <- y$a-y$b
        #get absoulte maximum while preserving sign
        return(y$diff[which.max(abs(y$diff))])
    }
    group <- dtu$group
    dtu_table[[paste0("max(",levels(group)[1], "-",levels(group)[2],")")]] <- as.numeric(sapply(dtu_table$geneID, FUN = getmax))
    return(dtu_table)
}

create_dtu_table <- function(dtu, txdf){
#' Title
#'
#' @param dtu
#' @param gene_name_info
#' @param gene_chromosome_info
#' @param gene_description
#'
#' @return
#' @export
#'
#' @examples
create_dtu_table <- function(dtu, gene_name_info=NULL, gene_chromosome_info=NULL, gene_description=NULL){
    max_delta_col <- paste0("max(",dtu$cond_levels[1], "-",dtu$cond_levels[2],")")
    dtu_table <- data.frame("geneID" = as.character(unique(dtu$final_q$geneID)), stringsAsFactors = F)

    if(!is.null(gene_name_info)){
        #TODO: either pull data from seurat or general function that handles named lists or dataframes with 2 columns
        dtu_table$geneName <- sapply(dtu_table$geneID, FUN = function(x) unique(txdf$external_gene_name[txdf$GENEID == x])[1])
    }

    if(!is.null(gene_chromosome_info)){
        #TODO
        dtu_table$chromosome <- sapply(dtu_table$geneID, FUN = function(x) unique(txdf$chromosome_name[txdf$GENEID == x]))
    }

    dtu_table$gene_qval <- sapply(dtu_table$geneID, FUN = function(x) min(dtu$final_q$gene[dtu$final_q$geneID == x]))
    dtu_table$min_tx_qval <- sapply(dtu_table$geneID, FUN = function(x) min(dtu$final_q$transcript[dtu$final_q$geneID == x]))
    dtu_table$n_tx <- sapply(dtu_table$geneID, FUN = function(x) length(dtu$final_q$geneID[dtu$final_q$geneID == x]))
    dtu_table$n_sig_tx <- sapply(dtu_table$geneID, FUN = function(x) length(dtu$final_q_tx$geneID[dtu$final_q_tx$geneID == x]))

    dtu_table <- add_max_delta(dtu_table, dtu)
    dtu_table$pct_expr_gene <- dtu$pct_exp_gene$pct_exp[match(dtu_table$geneID, dtu$pct_exp_gene$gene)]
    dtu_table$max_pct_expr_tx <- sapply(dtu_table$geneID, FUN = function(x) max(dtu$pct_exp_tx$pct_exp[dtu$pct_exp_tx$gene == x]))

    dtu_table <- dtu_table[!is.na(dtu_table[[max_delta_col]]),]

    if(!is.null(gene_description)){
        #TODO
        dtu_table$description <- sapply(dtu_table$geneID, FUN = function(x) unique(txdf$description[txdf$GENEID == x])[1])
    }
    dtu_table <- dtu_table[order(abs(dtu_table[[max_delta_col]]), decreasing = T),]

    return(append(dtu, list("dtu_table"=dtu_table)))
Loading