Commit 173501ee authored by Swati Parekh's avatar Swati Parekh
Browse files

removed extra R file

parent 4903ccfd
Loading
Loading
Loading
Loading

zUMIs-stats-withUMIs.R

deleted100644 → 0
+0 −120
Original line number Diff line number Diff line
#!/usr/bin/env Rscript
print("I am loading useful packages...")
print(Sys.time())
packages <-c("multidplyr","dplyr","tidyr","reshape2","data.table","optparse","parallel","Rsubread","methods","GenomicRanges","GenomicFeatures","GenomicAlignments","AnnotationDbi","ggplot2","cowplot","tibble")
paks<-lapply(packages, function(x) suppressMessages(require(x, character.only = TRUE)))
rm(paks)

# optparse  ---------------------------------------------------------------

option_list = list(
  make_option(c("--out"), type="character", default=NULL,
              help="output directory", metavar="character"),
  make_option(c("--sn"), type="character", default="study",
              help="Study name", metavar="character")
);

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

AllCounts=readRDS(paste(opt$out,"/zUMIs_output/expression/",opt$sn,".dgecounts.rds",sep=""))
reads=readRDS(paste(opt$out,"/zUMIs_output/expression/",opt$sn,".tbl.rds",sep=""))

# GeneCounts --------------------------------------------------------------

countGenes <- function(counttable,threshold=1){
  tmp <- counttable
  
  tmp[tmp<threshold]<-0
  tmp[tmp>=threshold]<-1
  
  samples<-as.data.frame(colSums(tmp))
  colnames(samples) <- c("Count")
  samples[,"SampleID"] <- as.factor(row.names(samples))
  row.names(samples) <- NULL
  return(samples)
}

countUMIs <- function(counttable){
  tmp <- counttable
  
  samples<-as.data.frame(colSums(tmp))
  colnames(samples) <- c("Count")
  samples[,"SampleID"] <- as.factor(row.names(samples))
  row.names(samples) <- NULL
  return(samples)
}

genecounts <- data.frame()
umicounts <- data.frame()
for(i in names(AllCounts)){
  genecounts <- rbind.data.frame(genecounts,cbind.data.frame(countGenes(AllCounts[[i]][["umicounts"]]),featureType=i))
  umicounts <- rbind.data.frame(umicounts,cbind.data.frame(countUMIs(AllCounts[[i]][["umicounts"]]),featureType=i))
}
med<-genecounts %>% dplyr::group_by(featureType) %>% dplyr::summarise(n=round(median(Count)))
medUMI<-umicounts %>% dplyr::group_by(featureType) %>% dplyr::summarise(n=round(median(Count)))

ag <- ggplot(genecounts, aes(x=featureType, y=Count, fill=featureType))
ag<-ag+geom_boxplot(notch = T) + geom_text(data=med,aes(x=featureType,y=n,label=n),size=5,vjust=-1,col="white") + scale_fill_manual(values = c("#1A5084","#914614","#118730")) + xlab("") + ylab("Number of genes") + theme_bw() + theme( axis.text = element_text(size=18), axis.title = element_text(size=16),legend.position = "none")
bg <- ggplot(umicounts, aes(x=featureType, y=Count, fill=featureType))
bg<-bg+geom_boxplot(notch = T) + geom_text(data=medUMI,aes(x=featureType,y=n,label=n),size=5,vjust=-1,col="white") + scale_fill_manual(values = c("#1A5084","#914614","#118730")) + xlab("") + ylab("Number of UMIs") + theme_bw() + theme( axis.text = element_text(size=18), axis.title = element_text(size=16),legend.position = "none")
c<-plot_grid(ag,bg,ncol = 2)
ggsave(c,filename = paste(opt$out,"/zUMIs_output/stats/",opt$sn,".geneUMIcounts.pdf",sep=""),width = 10,height = 5)
write.table(genecounts,file = paste(opt$out,"/zUMIs_output/stats/",opt$sn,".genecounts.txt",sep=""),sep="\t",row.names = F,col.names = T)
write.table(umicounts,file = paste(opt$out,"/zUMIs_output/stats/",opt$sn,".UMIcounts.txt",sep=""),sep="\t",row.names = F,col.names = T)


# Feature assignment ------------------------------------------------------

bc <- colnames(AllCounts$intron.exon$readcounts)

## Total number of reads per cell
cellTotal <- reads %>% dplyr::filter(XC %in% bc) %>% dplyr::group_by(XC) %>% dplyr::summarise(Total=length(GE))
p <- ggplot(cellTotal, aes(x=reorder(XC,Total), y=Total))
p<-p+geom_bar(stat = "identity",alpha=0.9,width=0.7) + scale_fill_brewer(labels = c("Exon Mapped","Intron Mapped","Ambiguity", "Intergenic", "Unmapped"), palette="Set2") + xlab("") + ylab("log10(Number of reads)") + scale_y_log10() + theme_bw() + theme(axis.text.x = element_text(size=4,angle = 90),axis.text.y = element_text(size=13), axis.title.y = element_text(size=20))

ggsave(p,filename = paste(opt$out,"/zUMIs_output/stats/",opt$sn,".readspercell.pdf",sep=""),width = 10,height = 7)
write.table(cellTotal,file = paste(opt$out,"/zUMIs_output/stats/",opt$sn,".readspercell.txt",sep=""),sep="\t",row.names = F,col.names = T)
###

##Reads per cell based on their assignment type
cellFeatures<-reads %>% dplyr::filter(XC %in% bc) %>% group_by(XC,GE,assignment,ftype) %>% summarise(r=length(XM),u=length(unique(XM)))
cellFeatures[which(cellFeatures$assignment!="Assigned"),"u"] <- 0

FeaturesBAR <- cellFeatures %>% group_by(XC,assignment,ftype) %>% summarise(rn=sum(r)-sum(u),un=sum(u)) %>% left_join(.,cellTotal,by="XC") %>% dplyr::mutate(Reads=rn/Total,UMIs=un/Total)

dfplot<-melt(FeaturesBAR %>% dplyr::select(XC,assignment,ftype,Reads,UMIs))
dfplot<-dfplot[!(dfplot$ftype=="inex" & dfplot$variable=="UMIs"),]

dfplot$Type <- paste(dfplot$ftype,dfplot$assignment,dfplot$variable,sep="_")
dfplot$Type <- gsub("inex_Unassigned_","",dfplot$Type)
dfplot$Type <- gsub("Ambiguity_Reads","Ambiguity",dfplot$Type)
dfplot$Type <- gsub("NoFeatures_Reads","Intergenic",dfplot$Type)
dfplot$Type <- gsub("Unmapped_Reads","Unmapped",dfplot$Type)
dfplot$Type <- gsub("_Assigned","",dfplot$Type)
dfplot$Type <- factor(dfplot$Type, levels=c("exon_UMIs","exon_Reads","intron_UMIs","intron_Reads","Ambiguity","Intergenic","Unmapped"))

a <- ggplot(dfplot, aes(x=Type, y=value, fill=Type))
a<-a+geom_boxplot(alpha=0.9,width=0.7) + scale_fill_manual(values = c("grey46","tan1","gold1","lightgreen","darkgreen","steelblue1","steelblue4")[7:1]) + xlab("") + ylab("Fraction of reads per cell") + theme_bw() + theme( axis.text = element_text(size=18), axis.title = element_text(size=18), legend.position = "none")

dfplots<-melt(FeaturesBAR %>% dplyr::select(XC,assignment,ftype,rn,un))
dfplots<-dfplots[!(dfplots$ftype=="inex" & dfplots$variable=="un"),]
dfplots$Type <- paste(dfplots$ftype,dfplots$assignment,dfplots$variable,sep="_")
dfplots$Type <- gsub("inex_Unassigned_","",dfplots$Type)
dfplots$Type <- gsub("Ambiguity_rn","Ambiguity",dfplots$Type)
dfplots$Type <- gsub("NoFeatures_rn","Intergenic",dfplots$Type)
dfplots$Type <- gsub("Unmapped_rn","Unmapped",dfplots$Type)
dfplots$Type <- gsub("_Assigned","",dfplots$Type)
dfplots$Type <- gsub("rn","Reads",dfplots$Type)
dfplots$Type <- gsub("un","UMIs",dfplots$Type)
dfplots<-dfplots %>% group_by(Type) %>% summarise(Reads=sum(value),frac=sum(value)/sum(cellTotal$Total)) %>% mutate(studyname="a")

dfplots$Type <- factor(dfplots$Type, levels=c("exon_UMIs","exon_Reads","intron_UMIs","intron_Reads","Ambiguity","Intergenic","Unmapped")[7:1])


b <- ggplot(dfplots, aes(x=studyname, y=frac, fill=Type))
b<-b+geom_bar(stat="identity",alpha=0.9,width=0.5) + scale_fill_manual(values = c("grey46","tan1","gold1","lightgreen","darkgreen","steelblue1","steelblue4")) + xlab("") +ylab(NULL)+ ggtitle("Fraction of reads in the dataset") + theme_bw() + theme( axis.text.x = element_text(size=14),axis.text.y = element_blank(),axis.ticks.y = element_blank(), axis.title = element_text(size=18), legend.position = "bottom",legend.title=element_blank(),plot.title = element_text(hjust=0.5,size=20),legend.text = element_text(size=20))+coord_flip()+ guides(fill = guide_legend(nrow = 1,reverse = T))
d<-plot_grid(c,b,a,ncol = 1,rel_heights  = c(0.3,0.2,0.5))

ggsave(d,filename = paste(opt$out,"/zUMIs_output/stats/",opt$sn,".features.pdf",sep=""),width = 12,height = 9)
write.table(dfplot[,-c(2:3)],file = paste(opt$out,"/zUMIs_output/stats/",opt$sn,".features.txt",sep=""),sep="\t",row.names = F,col.names = T)