|
| 1 | +--- |
| 2 | +title: "IMPROVE benchmark results" |
| 3 | +author: "Sara gosline" |
| 4 | +date: "2025-03-27" |
| 5 | +output: html_document |
| 6 | +--- |
| 7 | + |
| 8 | +This document describes basic analysis to carry out from the cross-study analysis work that the IMPROVE team has produced. The results are currently stored in Synapse and we illustrate how to visualize them. |
| 9 | + |
| 10 | +```{r setup, include=FALSE} |
| 11 | +knitr::opts_chunk$set(echo = TRUE) |
| 12 | +library(tidyverse) |
| 13 | +library(synapser) |
| 14 | +library(ggplot2) |
| 15 | +library(ggridges) |
| 16 | +synapser::synLogin() |
| 17 | +
|
| 18 | +``` |
| 19 | + |
| 20 | +## Collect data from synapse |
| 21 | +Natasha uploaded all the results so far to synapse, let's download them systematically into a single table with the model as a new column. |
| 22 | + |
| 23 | +```{r data} |
| 24 | +
|
| 25 | +#allscoreslist<-list(deepttc='syn65676079',graphdrp='syn65676103',lgbm='syn65676119',pathdsp='syn65676139',uno='syn65676159') |
| 26 | +#new scores with extra data |
| 27 | +allscoreslist<-list(deepttc='syn65880080',graphdrp='syn65928973',lgbm='syn65880116',pathdsp='syn65880133',uno='syn65676159') |
| 28 | +
|
| 29 | +##with pancpdo |
| 30 | +allscoreslist<-list(deepttc='syn66323471',graphdrp='syn66323492',lgbm='syn66323510',pathdsp='syn66326173',uno='syn66323527') |
| 31 | +
|
| 32 | +fullres<-do.call(rbind,lapply(names(allscoreslist),function(mod) |
| 33 | + readr::read_csv(synapser::synGet(allscoreslist[[mod]])$path)|>mutate(model=mod))) |
| 34 | +
|
| 35 | +``` |
| 36 | + |
| 37 | +We load all the data into a single table for comparison. Then we can experiment with plotting. |
| 38 | + |
| 39 | +## Comparison plotting |
| 40 | + |
| 41 | +We plot each model by metric and save the results in a file. |
| 42 | + |
| 43 | +```{r model comparison,warning=FALSE, message=FALSE} |
| 44 | +
|
| 45 | +doFullPlot<-function(metric){ |
| 46 | + sr<-fullres|> |
| 47 | + subset(met==metric) |
| 48 | + ##re-rank src samples by mean metric |
| 49 | + mvals<-sr|>group_by(src)|> |
| 50 | + summarize(mvals=mean(value))|> |
| 51 | + arrange(mvals) |
| 52 | + |
| 53 | + if(metric=='r2'){ |
| 54 | + sr$value<-sapply(sr$value,function (x) ifelse(x<(-1),-1,x)) |
| 55 | + } |
| 56 | + |
| 57 | + sr$src=factor(sr$src,levels=mvals$src) |
| 58 | + |
| 59 | + p<-sr|> |
| 60 | + ggplot(aes(x=value,y=model,fill=trg))+ |
| 61 | + ggridges::geom_density_ridges(alpha=0.5)+ |
| 62 | + facet_grid(src~.)+ |
| 63 | + ggtitle(paste0(metric,' by source dataset')) |
| 64 | + |
| 65 | + p1<-sr|> |
| 66 | + ggplot(aes(x=value,y=trg,fill=model))+ |
| 67 | + ggridges::geom_density_ridges(alpha=0.5)+ |
| 68 | + facet_grid(src~.)+ |
| 69 | + ggtitle(paste0(metric,' by source dataset')) |
| 70 | + |
| 71 | + ##now plot by target |
| 72 | + mvals<-sr|>group_by(trg)|> |
| 73 | + summarize(mvals=mean(value))|> |
| 74 | + arrange(mvals) |
| 75 | + sr$trg=factor(sr$trg,levels=mvals$trg) |
| 76 | +
|
| 77 | + p2<-sr|> |
| 78 | + ggplot(aes(x=value,y=model,fill=src))+ |
| 79 | + ggridges::geom_density_ridges(alpha=0.5)+ |
| 80 | + facet_grid(trg~.)+ |
| 81 | + ggtitle(paste0(metric,' by target dataset')) |
| 82 | + |
| 83 | + p3<-sr|> |
| 84 | + ggplot(aes(x=value,y=src,fill=model))+ |
| 85 | + ggridges::geom_density_ridges(alpha=0.5)+ |
| 86 | + facet_grid(trg~.)+ |
| 87 | + ggtitle(paste0(metric,' by target dataset')) |
| 88 | + cowplot::plot_grid(p,p2) |
| 89 | +ggsave(paste0(metric,'ridglines.png'),height=12,width=10) |
| 90 | +cowplot::plot_grid(p1,p3) |
| 91 | +ggsave(paste0(metric,'model_ridglines.png'),height=14,width=14) |
| 92 | +cowplot::plot_grid(p1,p3) |
| 93 | +
|
| 94 | +} |
| 95 | +
|
| 96 | +lapply(unique(fullres$met),function(x) doFullPlot(x)) |
| 97 | +
|
| 98 | +``` |
| 99 | + |
| 100 | +This shows all the results, but maybe we can plot more focused questions across datasets? |
| 101 | + |
| 102 | +## Model system performance |
| 103 | +Which model system performs best across cell lines vs ex vivo? Update this as we get more PDO/PDX data. |
| 104 | + |
| 105 | +```{r model performance, message=FALSE, warning=FALSE} |
| 106 | +exvivo=c('mpnst','beataml','sarcpdo','pancpdo','bladderpdo') |
| 107 | +doModelPlot<-function(metric){ |
| 108 | + sr<-fullres|> |
| 109 | + subset(met==metric) |
| 110 | + ##re-rank src samples by mean metric |
| 111 | + mvals<-sr|>group_by(trg)|> |
| 112 | + summarize(mvals=mean(value))|> |
| 113 | + arrange(mvals) |
| 114 | + |
| 115 | + if(metric=='r2'){ |
| 116 | + sr$value<-sapply(sr$value,function (x) ifelse(x<(-1),-1,x)) |
| 117 | + } |
| 118 | + |
| 119 | + sr$trg=factor(sr$trg,levels=mvals$trg) |
| 120 | + |
| 121 | + sr|>subset(trg%in%exvivo)|> |
| 122 | + ggplot(aes(x=value,alpha=0.8))+ |
| 123 | + geom_histogram()+facet_grid(model~trg)+ |
| 124 | + ggtitle(paste0(metric,' evaluated on ex vivo data')) |
| 125 | + |
| 126 | + |
| 127 | + ggsave(paste0(metric,'exVivoPerformance.png')) |
| 128 | + |
| 129 | + |
| 130 | + sr|>subset(!trg%in%exvivo)|> |
| 131 | + ggplot(aes(x=value,alpha=0.8))+ |
| 132 | + geom_histogram()+facet_grid(model~trg)+ |
| 133 | + ggtitle(paste0(metric,' evaluated on cell line data')) |
| 134 | + |
| 135 | + |
| 136 | + ggsave(paste0(metric,'CellLinePerformance.png')) |
| 137 | +
|
| 138 | +} |
| 139 | +
|
| 140 | +
|
| 141 | +lapply(unique(fullres$met),function(x) doModelPlot(x)) |
| 142 | +
|
| 143 | +``` |
| 144 | + |
| 145 | + |
| 146 | + |
0 commit comments