Commit ad54ceab authored by Ilya Korsunsky's avatar Ilya Korsunsky
Browse files

added functions to do pairwise group comparisons

parent 75a11045
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ Authors@R: c(
Description: Scalable implementation of the Wilcoxon rank sum test and auROC statistic. Interfaces to dense and sparse matrices, as well as genomics analysis frameworks Seurat and SingleCellExperiment. 
License: GPL-3
Encoding: UTF-8
Depends: R (>= 3.4.0), Rcpp
Depends: R (>= 3.4.0), Rcpp, data.table
LinkingTo: Rcpp, RcppArmadillo
Imports: 
    dplyr, 
@@ -24,7 +24,8 @@ Imports:
    Matrix,
    rlang,
    stats, 
    utils
    utils,
    DESeq2
RoxygenNote: 6.1.1
Suggests: 
    knitr,
@@ -34,7 +35,6 @@ Suggests:
    SingleCellExperiment, 
    SummarizedExperiment,
    broom,
    BiocStyle,
    DESeq2
    BiocStyle
NeedsCompilation: yes
VignetteBuilder: knitr
+7 −0
Original line number Diff line number Diff line
@@ -11,10 +11,17 @@ S3method(wilcoxauc,SingleCellExperiment)
S3method(wilcoxauc,default)
S3method(wilcoxauc,seurat)
export("%>%")
export(collapse_counts)
export(compute_hash)
export(nnzeroGroups)
export(pseudobulk_deseq2)
export(pseudobulk_one_vs_all)
export(pseudobulk_pairwise)
export(rank_matrix)
export(sumGroups)
export(summarize_dge_pairs)
export(top_markers)
export(top_markers_dds)
export(wilcoxauc)
import(Rcpp)
importClassesFrom(Matrix,TsparseMatrix)
+100 −30
Original line number Diff line number Diff line
@@ -37,35 +37,49 @@ collapse_counts <- function(counts_mat, meta_data, varnames) {
    design_collapsed$sample_id <- NULL
    return(list(counts_mat = counts_collapsed, meta_data = design_collapsed))
}
#' @export
pseudobulk_deseq2 <- function(dge_formula, meta_data, counts_df, verbose=TRUE, 
                   min_counts_per_sample=10, present_in_min_samples=5, collapse_background=TRUE, vals_test=NULL) {
    message('WARNING: meta_data should only contain pseudobulk identifying variables')

    ## filter low expressed genes
    genes_keep <- which(Matrix::rowSums(counts_df >= min_counts_per_sample) >= present_in_min_samples)
#' @export
pseudobulk_pairwise <- function(dge_formula, counts_df, meta_data, contrast_var, vals_test, verbose) {
    lapply(vals_test, function(foreground_id) {
        background_ids <- as.character(unique(meta_data$seurat_annotations)) %>% setdiff(foreground_id)
        lapply(background_ids, function(background_id) {
            if (verbose) {
        message(sprintf('Filtered out %d genes, analyzing %d genes', nrow(counts_df) - length(genes_keep), length(genes_keep)))
                message(sprintf('%s vs %s', foreground_id, background_id)) 
            }
    counts_df <- counts_df[genes_keep, ]
            suppressMessages({suppressWarnings({
                idx_use <- which(meta_data[[contrast_var]] %in% c(foreground_id, background_id))
                design <- meta_data[idx_use, ]
                
    ## assume that the first variable in formula is the main contrast variable
    all_vars <- unlist(strsplit(tail(as.character(dge_formula), 1), split = ' \\+ '))
    if (verbose) {
        message(sprintf('All vars: %s', paste(all_vars, collapse = ', ')))
    }
    contrast_var <- head(all_vars, 1)
    if (verbose) {
        message(sprintf('Contrast var: %s', contrast_var))
    }
    if (is.null(vals_test)) {
        vals_test <- unique(meta_data[[contrast_var]])    
    } else {
        if (any(!vals_test %in% unique(meta_data[[contrast_var]]))) {
            stop('vals_test must be values in the contrast var')    
        }
                design[[contrast_var]] <- factor(ifelse(design[[contrast_var]] == foreground_id,
                                                   paste0('cluster_', foreground_id), 
                                                   'background'))
                ## Do DGE with DESeq2
                dds <- DESeqDataSetFromMatrix(
                    countData = counts_df[, idx_use],
                    colData = design,
                    design = dge_formula) %>% 
                    DESeq2::DESeq()

                ## Get results 
                contrast_name <- grep('cluster.*_vs_background', resultsNames(dds), value = TRUE)
                dge_res <- results(dds, name = contrast_name) %>% 
                        data.frame() %>% 
                        tibble::rownames_to_column('feature') %>% 
                        dplyr::arrange(-stat) %>% 
                        dplyr::mutate(group1 = foreground_id, group2 = background_id)

            })})
            return(dge_res)        
        }) %>% 
            purrr::reduce(rbind) 
    }) %>% 
    purrr::reduce(rbind) %>% 
    dplyr::select(group1, group2, feature, dplyr::everything()) %>% 
    dplyr::arrange(group1, -stat)
}

#' @export
pseudobulk_one_vs_all <- function(dge_formula, counts_df, meta_data, contrast_var, vals_test, collapse_background, verbose) {
    Reduce(rbind, lapply(vals_test, function(foreground_id) {
        if (verbose) {
            message(foreground_id)      
@@ -83,7 +97,6 @@ pseudobulk_deseq2 <- function(dge_formula, meta_data, counts_df, verbose=TRUE,
                design <- res$meta_data
                counts_df <- res$counts_mat                
            }
#             res <- collapse_counts(counts_df, design, all_vars)
                        
            ## Do DGE with DESeq2
            dds <- DESeqDataSetFromMatrix(
@@ -102,7 +115,51 @@ pseudobulk_deseq2 <- function(dge_formula, meta_data, counts_df, verbose=TRUE,
        })})
        return(dge_res)
    })) %>% 
    dplyr::select(group, feature, dplyr::everything())
    dplyr::select(group, feature, dplyr::everything()) %>% 
    dplyr::arrange(group, -stat)
    
}



#' @export
pseudobulk_deseq2 <- function(dge_formula, meta_data, counts_df, verbose=TRUE, 
                              min_counts_per_sample=10, present_in_min_samples=5, 
                              collapse_background=TRUE, vals_test=NULL,
                              mode=c('one_vs_all', 'pairwise')[1]) {
    message('WARNING: meta_data should only contain pseudobulk identifying variables')
    
    ## filter low expressed genes
    genes_keep <- which(Matrix::rowSums(counts_df >= min_counts_per_sample) >= present_in_min_samples)
    if (verbose) {
        message(sprintf('Filtered out %d genes, analyzing %d genes', nrow(counts_df) - length(genes_keep), length(genes_keep)))
    }
    counts_df <- counts_df[genes_keep, ]
    
    ## assume that the first variable in formula is the main contrast variable
    all_vars <- unlist(strsplit(tail(as.character(dge_formula), 1), split = ' \\+ '))
    if (verbose) {
        message(sprintf('All vars: %s', paste(all_vars, collapse = ', ')))
    }
    contrast_var <- head(all_vars, 1)
    if (verbose) {
        message(sprintf('Contrast var: %s', contrast_var))
    }
    if (is.null(vals_test)) {
        vals_test <- as.character(unique(meta_data[[contrast_var]]))
    } else {
        if (any(!vals_test %in% unique(meta_data[[contrast_var]]))) {
            stop('vals_test must be values in the contrast var')    
        }
    }
    
    if (mode == 'one_vs_all') {
        res <- pseudobulk_one_vs_all(dge_formula, counts_df, meta_data, contrast_var, vals_test, collapse_background, verbose)
        
    } else if (mode == 'pairwise') {
        res <- pseudobulk_pairwise(dge_formula, counts_df, meta_data, contrast_var, vals_test, verbose)
    }
    return (res)
}

#' @export
@@ -121,3 +178,16 @@ top_markers_dds <- function(res, n=10, pval_max=1, padj_max=1, lfc_min=1) {
        tidyr::spread(.data$group, .data$feature, fill = NA) %>% 
        identity()
}

#' @export
summarize_dge_pairs <- function(dge_res, mode=c('min', 'max')[1]) {
    dge_res <- data.table(dge_res)
    switch(
        mode, 
        min = data.table(dge_res)[, head(.SD[order(stat)], 1), by = .(group1, feature)],
        max = data.table(dge_res)[, head(.SD[order(-stat)], 1), by = .(group1, feature)]
    ) %>% 
        dplyr::select(-group2) %>% 
        dplyr::rename(group = group1) %>% 
        dplyr::arrange(group, -stat)
}
 No newline at end of file
+1127 −242

File changed.

Preview size limit exceeded, changes collapsed.