Commit 21d149f8 authored by smorabit's avatar smorabit
Browse files

more work on ROC code

parent 5b0f82e5
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -18,6 +18,11 @@ SelectNetworkGenes <- function(seurat_obj, gene_select="variable", fraction=0.05
    stop(paste0("Invalid selection gene_select: ", gene_select, '. Valid gene_selects are variable, fraction, all, or custom.'))
  }

  # TODO:
  # get genes that are expressed in a fraction of cells in select groups (ie celltypes)
  # this would reduce a bias against genes that are only expressed in underrepresented
  # cell-types

  # handle different selection strategies
  if(gene_select == "fraction"){

@@ -293,7 +298,7 @@ ConstructNetwork <- function(
  seurat_obj <- SetModules(
    seurat_obj, mod_df = data.frame(
      "gene_name" = names(mods),
      "module" = mods,
      "module" = factor(mods, levels=unique(mods)),
      "color" = mods
    )
  )
+46 −7
Original line number Diff line number Diff line
@@ -16,9 +16,12 @@
ConstructMetacells <- function(
  seurat_obj, name='agg', ident.group='seurat_clusters', k=50,
  reduction='umap', assay='RNA',
  slot='counts',  meta=NULL, return_metacell=FALSE
  cells.use = NULL, # if we don't want to use all the cells to make metacells, good for train/test split
  slot='counts',  meta=NULL, return_metacell=FALSE, wgcna_name=NULL
){

  if(is.null(wgcna_name)){wgcna_name <- seurat_obj@misc$active_wgcna}

  # check reduction
  if(!(reduction %in% names(seurat_obj@reductions))){
    stop(paste0("Invalid reduction (", reduction, "). Reductions in Seurat object: ", paste(names(seurat_obj@reductions), collapse=', ')))
@@ -34,6 +37,14 @@ ConstructMetacells <- function(
    stop(paste0("Invalid slot (", slot, "). Valid options for slot: counts, data, scale.data "))
  }

  # subset seurat object by selected cells:
  if(!is.null(cells.use)){
    seurat_full <- seurat_obj
    seurat_obj <- seurat_obj[,cells.use]
  }

  print(dim(seurat_obj))

  reduced_coordinates <- as.data.frame(seurat_obj@reductions[[reduction]]@cell.embeddings)
  nn_map <- FNN::knn.index(reduced_coordinates, k = (k - 1))
  row.names(nn_map) <- row.names(reduced_coordinates)
@@ -107,8 +118,13 @@ ConstructMetacells <- function(
    out <- metacell_obj
  } else{

    # revert to full seurat object if we subsetted earlier
    if(!is.null(cells.use)){
      seurat_obj <- seurat_full
    }

    # add seurat metacell object to the main seurat object:
    seurat_obj <- SetMetacellObject(seurat_obj, metacell_obj)
    seurat_obj <- SetMetacellObject(seurat_obj, metacell_obj, wgcna_name)

    # add other info
    seurat_obj <- SetWGCNAParams(
@@ -117,7 +133,8 @@ ConstructMetacells <- function(
        'metacell_reduction' = reduction,
        'metacell_slot' = slot,
        'metacell_assay' = assay
      )
      ),
      wgcna_name
    )

    out <- seurat_obj
@@ -143,9 +160,19 @@ ConstructMetacells <- function(
#' MetacellsByGroups(pbmc)
MetacellsByGroups <- function(
  seurat_obj, group.by=c('seurat_clusters'), ident.group='seurat_clusters',
  k=50, reduction='umap', assay='RNA', slot='counts'
  k=50, reduction='umap', assay='RNA',
  cells.use = NULL, # if we don't want to use all the cells to make metacells, good for train/test split
  slot='counts', wgcna_name=NULL
){

  if(is.null(wgcna_name)){wgcna_name <- seurat_obj@misc$active_wgcna}

  # subset seurat object by seleted cells:
  if(!is.null(cells.use)){
    seurat_full <- seurat_obj
    seurat_obj <- seurat_obj[,cells.use]
  }

  # setup grouping variables
  if(length(group.by) > 1){
    seurat_meta <- seurat_obj@meta.data[,group.by]
@@ -157,6 +184,12 @@ MetacellsByGroups <- function(
    seurat_obj$metacell_grouping <- as.character(seurat_obj@meta.data[[group.by]])
  }
  groupings <- unique(seurat_obj$metacell_grouping)
  groupings <- groupings[order(groupings)]

  # remove groups that are too small:
  # TODO: add a warning to let the user know that some groups are skipped?
  groupings <- groupings[table(seurat_obj$metacell_grouping) >= 2*k]
  print(groupings)

  # unique meta-data for each group
  meta_df <- as.data.frame(do.call(rbind, strsplit(groupings, '_')))
@@ -179,7 +212,7 @@ MetacellsByGroups <- function(
    seurat_obj = seurat_list,
    name = groupings,
    meta = meta_list,
    MoreArgs = list(k=k, reduction=reduction, assay=assay, slot=slot, return_metacell=TRUE)
    MoreArgs = list(k=k, reduction=reduction, assay=assay, slot=slot, return_metacell=TRUE, wgcna_name=wgcna_name)
  )
  names(metacell_list) <- groupings

@@ -189,8 +222,13 @@ MetacellsByGroups <- function(
  # set idents for metacell object:
  Idents(metacell_obj) <- metacell_obj@meta.data[[ident.group]]

  # revert to full seurat object if we subsetted earlier
  if(!is.null(cells.use)){
    seurat_obj <- seurat_full
  }

  # add seurat metacell object to the main seurat object:
  seurat_obj <- SetMetacellObject(seurat_obj, metacell_obj)
  seurat_obj <- SetMetacellObject(seurat_obj, metacell_obj, wgcna_name)

  # add other info
  seurat_obj <- SetWGCNAParams(
@@ -199,7 +237,8 @@ MetacellsByGroups <- function(
      'metacell_reduction' = reduction,
      'metacell_slot' = slot,
      'metacell_assay' = assay
    )
    ),
    wgcna_name
  )
  seurat_obj
}
+1 −1
Original line number Diff line number Diff line
@@ -680,7 +680,7 @@ motif_df <- data.frame(
)

# get promoter regions for this species (mouse for now)
# THIS DOES NOT WORK IF I USE X & Y
# THIS DOES NOT WORK IF I USE X & Y chromosomes!?#?/???

gene.promoters <- promoters(EnsDb.Mmusculus.v79, filter = ~ gene_biotype == "protein_coding") %>% subset(seqnames %in% c(1:19))

test_ROC.Rmd

0 → 100644
+384 −0
Original line number Diff line number Diff line

# Load data

```{r eval=FALSE}

# conda activate spatial
library(Seurat)
library(tidyverse)
library(cowplot)
library(Matrix)
library(viridis)
library(presto)
library(harmony)
library(RColorBrewer)
library(patchwork)
library(ggpubr)
library(tictoc)
library(RColorBrewer)
library(Hmisc)
library(corrplot)
library(enrichR)
library(GeneOverlap)
library(WGCNA)
enableWGCNAThreads(nThreads = 8)

set.seed(2021)
colfunc <- colorRampPalette(rev(brewer.pal(11, 'Spectral' )))
theme_set(theme_cowplot())
# scp ../../pipelines/scWGCNA/R/* hpc3:/dfs3b/swaruplab/smorabit/analysis/scWGCNA/bin/
setwd("/dfs3b/swaruplab/smorabit/collab/woodlab/cocaine_mouse_2021/Nurr2c_vs_GFP/scWGCNA")

# source all of the scWGCNA scripts:
scripts <- dir("bin/")
scripts <- scripts[scripts != 'scWGCNA.R']
for(script in scripts){
  source(paste0("bin/", script))
}

# directories
data_dir <- "data/"
fig_dir <- 'figures/'

umap_theme <- theme(
  axis.line=element_blank(),
  axis.text.x=element_blank(),
  axis.text.y=element_blank(),
  axis.ticks=element_blank(),
  axis.title.x=element_blank(),
  axis.title.y=element_blank(),
  panel.background=element_blank(),
  panel.border=element_blank(),
  panel.grid.major=element_blank(),
  panel.grid.minor=element_blank(),
  plot.background=element_blank(),
  plot.title = element_text(hjust = 0.5)
)

# load seurat obj from AD NatGen paper:
NucSeq <- readRDS('/dfs3b/swaruplab/smorabit/analysis/AD_NucSeq_2019/batch_correction/liger/update/celltype-analysis/data/NucSeq_batch_correct_seurat.rds')

# load NatGen color scheme:
load('/dfs3b/swaruplab/smorabit/analysis/AD_NucSeq_2019/batch_correction/liger/update/celltype-analysis/data/color_scheme.rda')


# directories
data_dir <- "data/"
fig_dir <- 'figures/'

# load mouse <-> human gene name table:
hg38_mm10_genes <- read.table(
  "/dfs3b/swaruplab/smorabit/resources/hg38_mm10_orthologs_2021.txt",
  sep='\t',
  header=TRUE
)
colnames(hg38_mm10_genes) <-c('hg38_id', 'mm10_id', 'mm10_name', 'hg38_name')
hg38_mm10_genes <- dplyr::select(hg38_mm10_genes, c(hg38_name, mm10_name, hg38_id, mm10_id))

# load scWGCNA testing seurat object
seurat_obj <- readRDS(file='data/test_wgcna_seurat.rds')

# # select celltype:
# cur_celltype <- 'MHb-Neuron'
#
# # seurat obj for this celltype
# seurat_obj <- subset(seurat_obj, cell_type == cur_celltype)

```

Violin Plots for Vivek's grant:

```{r eval=FALSE}

genes <- c("APP", "BACE1", "ADAM10", "BIN1")

plot_list <- VlnPlot(NucSeq, features=genes, split.by='Diagnosis', group.by='Cell.Type', combine=FALSE, split.plot=TRUE, pt.size=0)

for(i in 1:length(plot_list)){
  plot_list[[i]] <- plot_list[[i]] + NoLegend() + xlab('') + ylab('')  +
  stat_compare_means(method='wilcox', label='p.signif')
}

pdf(paste0(fig_dir, 'vlnplot_for_vivek.pdf'), width=5, height=4)
plot_list
dev.off()


```


Human
Run scWGCNA on 80/20 train/test split
project Modules onto test data
run ROC code

```{r eval=FALSE}

################################################################################
# Setup data
#
# Note: all genes have already been Scaled for this Seurat object
################################################################################

set.seed(12345)

# get only control samples
seurat_obj <- subset(NucSeq, Diagnosis == 'Control' & Cell.Type != "PER.END")
table(seurat_obj$Cell.Type, seurat_obj$Sample.ID)

# 80/20 train/test split:
train_prop = 0.8
train_cells <- sample(colnames(seurat_obj), round(train_prop * ncol(seurat_obj)))
seurat_obj$wgcna_train <- ifelse(colnames(seurat_obj) %in% train_cells, 'train', 'test')

# setup training data for scWGCNA:
seurat_obj <- SetupForWGCNA(
  seurat_obj, wgcna_name = "train",
  gene_select  = "fraction",
  fraction = 0.1
)

# construct metacells:
seurat_obj <- MetacellsByGroups(
  seurat_obj = seurat_obj,
  group.by = c("Cell.Type", "Sample.ID"),
  cells.use = rownames(subset(seurat_obj@meta.data, wgcna_train == 'train')),
  k = 25,
  ident.group = 'Cell.Type'
)

# normalize metacells:
seurat_obj <- NormalizeMetacells(seurat_obj)

########################################################################
#  Construct co-expression networks
########################################################################

# Test different soft powers:
seurat_obj <- TestSoftPowers(seurat_obj, group.by='Cell.Type', group_name="ASC")

# construct wgcna network:
seurat_obj <- ConstructNetwork(
  seurat_obj, soft_power=8,
  group.by='Cell.Type', group_name="ASC"
)

# plot the dendrogram
pdf("figures/test_human_ASC_dendro.pdf",height=5, width=8)
PlotDendrogram(seurat_obj, main='ASC')
dev.off()


########################################################################
# Compute Module Eigengenes, module connectivity, and module scores
########################################################################

# compute all MEs in the full single-cell dataset
seurat_obj <- ModuleEigengenes(seurat_obj, group.by.vars="Batch")

# compute module connectivity:
seurat_obj <- ModuleConnectivity(seurat_obj)

# compute module hub gene scores:
seurat_obj <- ModuleExprScore(seurat_obj, n_genes = 25, method='Seurat')

plot_list <- ModuleFeaturePlot(seurat_obj)
pdf("figures/test_MEFeaturePlot_human_hMEs.pdf",height=12, width=12)
wrap_plots(plot_list, ncol=4)
dev.off()

# plot module scores (Seurat)
plot_list <- ModuleFeaturePlot(seurat_obj, features='scores')
pdf("figures/test_MEFeaturePlot_human_scores.pdf",height=12, width=12)
wrap_plots(plot_list, ncol=4)
dev.off()

# save processed object:
saveRDS(seurat_obj, file=paste0(data_dir, 'human_AD_NatGen_scWGCNA.rds'))

```

Make ROC functions

should be able to take one seurat object with a train/test metadata column,
or to take two seurat objects as input


```{r eval=FALSE}

library(pROC)

# get Modules
modules <- GetModules(seurat_obj)
mods <- levels(modules$module)
mods <- mods[mods != 'grey']

seurat_obj$wgcna_train <- ifelse(seurat_obj$wgcna_train == 'train', TRUE, FALSE)

split_col <- 'wgcna_train'
group.by <- 'monocle_clusters_umap_ID'
groups <- as.character(unique(seurat_obj@meta.data[[group.by]]))

# split into two seurat objects based on train & test split:
seurat_train <- seurat_obj[,seurat_obj@meta.data[[split_col]]]
seurat_test <- seurat_obj[,!seurat_obj@meta.data[[split_col]]]

# project modules for train
seurat_train <- ProjectModules(
  seurat_train,
  seurat_ref = seurat_obj,
  group.by.vars="Batch",
  wgcna_name_proj="ROC",
  scale_genes=FALSE, verbose=TRUE
)

# project modules for test
seurat_test <- ProjectModules(
  seurat_test,
  seurat_ref = seurat_obj,
  group.by.vars="Batch",
  wgcna_name_proj="ROC",
  scale_genes=FALSE, verbose=TRUE
)

# get hMEs for ROC comparison:
MEs <- as.data.frame(GetMEs(seurat_train, harmonized=TRUE, wgcna_name="ROC")) %>% mutate(group = seurat_train@meta.data[[group.by]])
MEs_p <- as.data.frame(GetMEs(seurat_test, harmonized=TRUE, wgcna_name="ROC")) %>% mutate(group = seurat_test@meta.data[[group.by]])

# compute average MEs in each group:
avg_MEs <- MEs %>% group_by(group) %>% summarise(across(!!mods, mean))
avg_MEs_p <- MEs_p %>% group_by(group) %>% summarise(across(!!mods, mean))
groups <- avg_MEs$group

# scale each column between 0 & 1
avg_MEs <- avg_MEs %>% summarise(across(!!mods, scale01))
avg_MEs_p <- avg_MEs_p %>% summarise(across(!!mods, scale01))

# convert to binary labels:
thresh <- 0.75
labels <- avg_MEs %>% purrr::map(~ifelse(. >= thresh, TRUE, FALSE))
labels <- as.data.frame(do.call(cbind, labels));
rownames(labels) <- as.character(groups)

predictions <- avg_MEs_p %>% purrr::map(~ifelse(. >= thresh, TRUE, FALSE))
predictions <- as.data.frame(do.call(cbind, predictions));
rownames(predictions) <- as.character(groups)

# loop over modules to compute ROC curves:
plot_df <- data.frame()
conf_df <- data.frame()
auc_list <- list()
mod_colors <- list()
for(cur_mod in mods){
  print(cur_mod)
  cur_color <- modules %>% subset(module == cur_mod) %>% .$color %>% unique
  mod_colors[[cur_mod]] <- cur_color

  # compute ROC
  rocobj <- pROC::roc(labels[,cur_mod], avg_MEs_p[[cur_mod]])
  auc_list[[cur_mod]] <- as.numeric(rocobj$auc)
  # confidence intervals:
  # sens_ci <- ci.se(rocobj)

  cur_df <- data.frame(
    specificity = 1-rocobj$specificities,
    sensitivity = rocobj$sensitivities,
    #auc = rocobj$auc,
    module = cur_mod,
    color = cur_color,
    auc = as.numeric(rocobj$auc)
  #  ci_lo = sens_ci[,1],
  #  ci_med = sens_ci[,2],
  #  ci_hi = sens_ci[,3]

  )
  plot_df <- rbind(plot_df, cur_df)

  cur_conf <- as.data.frame(ci.se(rocobj))
  cur_conf$sensitivity <- 1-as.numeric(rownames(cur_conf))
  cur_conf$module <- cur_mod
  cur_conf$color <- cur_color
  conf_df <- rbind(conf_df, cur_conf)

}
colnames(conf_df)[1:3] <- c('lo', 'mid', 'hi')

# plot the ROC curve
plot_df <- plot_df %>% group_by(module) %>% arrange(sensitivity)
conf_df <- conf_df %>% group_by(module) %>% arrange(sensitivity)
auc_df <- distinct(plot_df[,c('module', 'auc')])

p <- plot_df %>% ggplot(
  aes(x=specificity, y=sensitivity, color=module, fill=module),
) +
  geom_line() +
  geom_ribbon(
    data=conf_df,
    aes(x = sensitivity, ymin=lo, ymax=hi, fill=module),
    inherit.aes=FALSE, alpha=0.4
  ) +
  scale_color_manual(values = unlist(mod_colors)) +
  scale_fill_manual(values = unlist(mod_colors)) +
  scale_x_continuous(breaks = c(0, 0.5, 1), labels=c("0", "0.5", "1")) +
  scale_y_continuous(breaks = c(0, 0.5, 1), labels=c("0", "0.5", "1")) +
  xlab("1 - Specificity (FPR)") + ylab("Sensitivity (TPR)") +
  geom_text(
    data = auc_df,
    aes(color=module),
    x=0.75, y=0.1, label=paste0("AUC: ", format(auc_df$auc, digits=2)),
    inherit.aes=FALSE, size=4, color='black'
  )
  #annotate("text", x=0.75, y=0.1, label=paste0("AUC: ", plot_df$auc))
  # theme(
  #   axis.text = element_blank(),
  #   axis.ticks = element_blank()
  # )



pdf(paste0(fig_dir, 'test_ROC_human.pdf'), width=8, height=6)
p + facet_wrap(~module, ncol=4) + NoLegend()
dev.off()


unlist(auc_list)

```


Project onto human AD dataset:

```{r eval=FALSE}

# load AD NatGen dataset
NucSeq <- readRDS('/dfs3b/swaruplab/smorabit/analysis/AD_NucSeq_2019/batch_correction/liger/update/celltype-analysis/data/NucSeq_batch_correct_seurat.rds')

# keep only a few samples:
NucSeq <- subset(NucSeq, Sample.ID %in% c("Sample-100", "Sample-45"))

NucSeq <- ProjectModules(
  seurat_obj=NucSeq,
  seurat_ref=seurat_obj,
  gene_mapping=hg38_mm10_genes,
  genome1_col="mm10_name",
  genome2_col="hg38_name",
  scale_genes=TRUE,
  wgcna_name_proj="MHb_projected"
)

# compute module expression score & average module expression:
NucSeq <- ModuleExprScore(NucSeq, n_genes = 25, method='Seurat')

# ME featureplot of projected data:
plot_list <- ModuleFeaturePlot(NucSeq, features='scores', order='shuffle', reduction='umap')
pdf("figures/test_MEFeaturePlot_projected_human.pdf",height=12, width=12)
wrap_plots(plot_list, ncol=4)
dev.off()


# ME correlogram of projected data:
pdf("figures/test_ME_correlogram_projected_human.pdf",height=6, width=6)
ModuleCorrelogram(subset(NucSeq, Cell.Type == 'ASC'), sig.level = 0.001, pch.cex=2, features='hMEs')
dev.off()

```