Add in figure plotting in Rmd

sgosline · sgosline · commit 4dd67fade7bc · 2025-05-02T12:55:15.000-07:00
diff --git a/manuscript/improveResultVis.Rmd b/manuscript/improveResultVis.Rmd
@@ -0,0 +1,146 @@
+---
+title: "IMPROVE benchmark results"
+author: "Sara Gosline"
+date: "2025-03-27"
+output: html_document
+---
+
+This document describes basic analyses of the cross-study results that the IMPROVE team has produced. The results are currently stored in Synapse, and we illustrate how to visualize them.
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE) # echo chunk source code in the knitted output by default
+library(tidyverse)
+library(synapser)
+library(ggplot2) # already attached by tidyverse; kept explicit
+library(ggridges)
+synapser::synLogin() # log in to Synapse; presumably uses cached credentials - TODO confirm
+
+```
+
+## Collect data from Synapse
+Natasha uploaded all the results so far to Synapse; let's download them systematically into a single table with the model as a new column.
+
+```{r data} 
+
+## Synapse IDs of the score tables by model; superseded sets kept for reference
+#allscoreslist<-list(deepttc='syn65676079',graphdrp='syn65676103',lgbm='syn65676119',pathdsp='syn65676139',uno='syn65676159')
+#new scores with extra data
+#allscoreslist<-list(deepttc='syn65880080',graphdrp='syn65928973',lgbm='syn65880116',pathdsp='syn65880133',uno='syn65676159')
+##with pancpdo
+allscoreslist<-list(deepttc='syn66323471',graphdrp='syn66323492',lgbm='syn66323510',pathdsp='syn66326173',uno='syn66323527')
+## download each table, tag its rows with the model name, then stack them
+fullres<-do.call(rbind,lapply(names(allscoreslist),function(mod)
+  readr::read_csv(synapser::synGet(allscoreslist[[mod]])$path)|>mutate(model=mod)))
+
+```
+
+We load all the data into a single table for comparison. Then we can experiment with plotting. 
+
+## Comparison plotting
+
+For each metric, we plot the results of every model and save the figures as PNG files.
+
+```{r model comparison,warning=FALSE, message=FALSE}
+
+## Ridgeline plots of one metric across models, source and target datasets.
+## `metric`: a value of `fullres$met`. Saves '<metric>ridglines.png' and
+## '<metric>model_ridglines.png'; returns the model-coloured grid (p1, p3).
+doFullPlot<-function(metric){
+    sr<-fullres|>
+      subset(met==metric)
+    ## rank source datasets by mean metric (uses values before r2 clamping)
+    src_rank<-sr|>group_by(src)|>
+      summarize(mvals=mean(value))|>
+      arrange(mvals)
+
+    if(metric=='r2'){
+      ## clamp r2 below at -1; pmax is vectorized and always returns a vector
+      sr$value<-pmax(sr$value,-1)
+    }
+    sr$src<-factor(sr$src,levels=src_rank$src)
+
+    ## faceted by source: models on the y axis, coloured by target
+    p<-ggplot(sr,aes(x=value,y=model,fill=trg))+
+      ggridges::geom_density_ridges(alpha=0.5)+
+      facet_grid(src~.)+
+      ggtitle(paste0(metric,' by source dataset'))
+    ## faceted by source: targets on the y axis, coloured by model
+    p1<-ggplot(sr,aes(x=value,y=trg,fill=model))+
+      ggridges::geom_density_ridges(alpha=0.5)+
+      facet_grid(src~.)+
+      ggtitle(paste0(metric,' by source dataset'))
+
+    ## now rank target datasets by mean metric (after any r2 clamping)
+    trg_rank<-sr|>group_by(trg)|>
+      summarize(mvals=mean(value))|>
+      arrange(mvals)
+    sr$trg<-factor(sr$trg,levels=trg_rank$trg)
+
+    p2<-ggplot(sr,aes(x=value,y=model,fill=src))+
+      ggridges::geom_density_ridges(alpha=0.5)+
+      facet_grid(trg~.)+
+      ggtitle(paste0(metric,' by target dataset'))
+    p3<-ggplot(sr,aes(x=value,y=src,fill=model))+
+      ggridges::geom_density_ridges(alpha=0.5)+
+      facet_grid(trg~.)+
+      ggtitle(paste0(metric,' by target dataset'))
+
+    ## pass each grid to ggsave explicitly instead of relying on last_plot()
+    by_dataset<-cowplot::plot_grid(p,p2)
+    ggsave(paste0(metric,'ridglines.png'),plot=by_dataset,height=12,width=10)
+    by_model<-cowplot::plot_grid(p1,p3)
+    ggsave(paste0(metric,'model_ridglines.png'),plot=by_model,height=14,width=14)
+    by_model
+}
+
+lapply(unique(fullres$met),doFullPlot) ## one call per distinct metric
+
+```
+
+This shows all the results, but maybe we can plot more focused questions across datasets?
+
+## Model system performance
+Which model system performs best across cell lines vs ex vivo? Update this as we get more PDO/PDX data. 
+
+```{r model performance, message=FALSE, warning=FALSE}
+## target datasets treated as ex vivo; every other target counts as cell line
+exvivo<-c('mpnst','beataml','sarcpdo','pancpdo','bladderpdo')
+## Histograms of one metric per model and target dataset, ex vivo and cell
+## line targets apart. `metric`: a value of `fullres$met`. Saves
+## '<metric>exVivoPerformance.png' and '<metric>CellLinePerformance.png'.
+doModelPlot<-function(metric){
+    sr<-fullres|>
+      subset(met==metric)
+    ## rank target datasets by mean metric (uses values before r2 clamping)
+    trg_rank<-sr|>group_by(trg)|>
+      summarize(mvals=mean(value))|>
+      arrange(mvals)
+
+    if(metric=='r2'){
+      ## clamp r2 below at -1; pmax is vectorized and always returns a vector
+      sr$value<-pmax(sr$value,-1)
+    }
+    sr$trg<-factor(sr$trg,levels=trg_rank$trg)
+
+    ## alpha is a fixed setting, so it goes on the geom rather than inside
+    ## aes(), where it would be mapped as data and add a spurious legend
+    ex_vivo_plot<-sr|>subset(trg%in%exvivo)|>
+      ggplot(aes(x=value))+
+      geom_histogram(alpha=0.8)+facet_grid(model~trg)+
+      ggtitle(paste0(metric,' evaluated on ex vivo data'))
+    ggsave(paste0(metric,'exVivoPerformance.png'),plot=ex_vivo_plot)
+
+    cell_line_plot<-sr|>subset(!trg%in%exvivo)|>
+      ggplot(aes(x=value))+
+      geom_histogram(alpha=0.8)+facet_grid(model~trg)+
+      ggtitle(paste0(metric,' evaluated on cell line data'))
+    ggsave(paste0(metric,'CellLinePerformance.png'),plot=cell_line_plot)
+}
+
+
+lapply(unique(fullres$met),doModelPlot) ## one call per distinct metric
+
+```
+
+
+