Skip to content

Commit 6d549b9

Browse files
committed
initial commit of new figure outlines
1 parent 47e9df8 commit 6d549b9

7 files changed

Lines changed: 446 additions & 96 deletions
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
##figure ploting functions incorporates and standardizes the calls made across figures
2+
3+
library(ggplot2)
4+
library(dplyr)
5+
library(ggridges)
6+
library(synapser)
7+
8+
##COLORS: standardize here
9+
modelcolors <- c()
10+
datasetcolors <- c()
11+
12+
exvivo = c('mpnst','beataml','sarcpdo','pancpdo','bladderpdo')
13+
14+
synapser::synLogin()
15+
16+
##Download per-model performance scores from Synapse and keep only
##cross-dataset results (src != trg) whose training source is not ex vivo.
##Returns a data frame with the columns of the score csvs plus `model`
##and `withinDataset`.
getModelPerformanceData <- function(){

  ##Synapse ids of the score tables, one per model (version with pancpdo).
  ##Previous (pre-pancpdo) ids were: deepttc syn65880080, graphdrp
  ##syn65928973, lgbm syn65880116, pathdsp syn65880133, uno syn65676159
  ##(the old list was previously assigned then immediately overwritten).
  allscoreslist <- list(deepttc = 'syn66323471',graphdrp = 'syn66323492',lgbm = 'syn66323510',pathdsp = 'syn66326173',uno = 'syn66323527')

  ##read each model's scores and tag rows with the model name
  fullres <- do.call(rbind,lapply(names(allscoreslist),function(mod)
    readr::read_csv(synapser::synGet(allscoreslist[[mod]])$path) |> mutate(model = mod)))

  ##flag rows trained and evaluated on the same dataset
  ##(direct comparison; ifelse(cond, TRUE, FALSE) was redundant)
  fullres <- fullres |>
    mutate(withinDataset = src == trg)

  #lets remove same-dataset data
  cdres <- subset(fullres,!withinDataset)

  ##lets remove ex vivo training sources
  cdres <- subset(cdres,!src %in% c('mpnst','beataml'))

  return(cdres)
}
37+
38+
39+
40+
###these files are very big so i'm not sure how to deal with them.
##Download per-model prediction tables from Synapse.
##dset: character vector of model names to pull (default 'lgbm'); each
##name must be a key of the `preds` list below.
##Returns a data frame of predictions with a `model` column added.
getModelPredictionData<-function(dset='lgbm'){

  ##Synapse ids of the prediction files, one per model
  preds <- list(deepttc = 'syn68149793', graphdrp = 'syn68146828', lgbm = 'syn68149807', pathdsp = 'syn66772452', uno = 'syn68149809')

  ##read only the requested models and tag each row with its model name
  fullres <- do.call(rbind,lapply(dset,function(mod)
    readr::read_csv(synapser::synGet(preds[[mod]])$path) |> mutate(model = mod)))

  ##bug fix: previously returned `preds` (the static synapse-id list)
  ##instead of the assembled prediction table
  return(fullres)
}
51+
52+
#this function plots a single metric by all the possible values
#metric: name of the metric to plot (e.g. 'pcc', 'scc', 'r2')
#dataset: performance data frame with met/value/src/trg/model columns
#prefix: currently unused; kept for call-site compatibility
#returns a list of two ggplot ridgeline plots: src (faceted by source)
#and trg (faceted by target)
ridgelineMetricPlots <- function(metric,dataset=cdres, prefix='all'){

  sr <- dataset |>
    subset(met == metric)

  ##facet by source - compare performance across a single source

  ##re-rank src samples by mean metrics
  ##(note: ranking uses the unclamped values; r2 clamping happens below)
  mvals <- sr |> group_by(src) |>
    summarize(mvals = mean(value)) |>
    arrange(mvals)

  ##clamp r2 at -1 so extreme negatives don't dominate the x axis
  ##(pmax replaces the equivalent sapply/ifelse loop)
  if (metric == 'r2') {
    sr$value <- pmax(sr$value, -1)
  }

  sr$src = factor(sr$src,levels = mvals$src)

  #compare models by source dataset
  p1 <- sr |>
    ggplot(aes(x = value,y = trg,fill = model)) +
    ggridges::geom_density_ridges(alpha = 0.5) +
    facet_grid(src~.) +
    ggtitle(paste0(metric,' by source dataset'))

  ##now we rerank by target dataset and evaluate by target
  mvals <- sr |> group_by(trg) |>
    summarize(mvals = mean(value)) |>
    arrange(mvals)
  sr$trg = factor(sr$trg,levels = mvals$trg)

  #plot source by target data
  p3 <- sr |>
    ggplot(aes(x = value,y = src,fill = model)) +
    ggridges::geom_density_ridges(alpha = 0.5) +
    facet_grid(trg~.) +
    ggtitle(paste0(metric,' by target dataset'))

  return(list(src=p1,trg=p3))
}
95+
96+
97+
##here we have to interrogate the results to visualize how specific drugs are behaving
##TODO: not yet implemented - body is empty, so calling this returns NULL
performanceByDrugOrSample<-function(){

}
101+
102+
103+
##do we still need this function?
##Plot histograms of a single metric, faceted model x target, and save
##one png for ex vivo targets and one for cell line targets.
##metric: metric name to plot; dataset: performance data frame.
##Side effect: writes <metric>exVivoPerformance.png and
##<metric>CellLinePerformance.png via ggsave.
doModelPlot <- function(metric, dataset=cdres){

  sr <- dataset |>
    subset(met == metric)
  ##re-rank trg samples by mean metric (comment previously said src,
  ##but the code groups by trg)
  mvals <- sr |>
    group_by(trg) |>
    summarize(mvals = mean(value)) |>
    arrange(mvals)

  ##clamp r2 at -1 (pmax replaces the equivalent sapply/ifelse loop)
  if(metric == 'r2') {
    sr$value <- pmax(sr$value, -1)
  }

  sr$trg = factor(sr$trg,levels = mvals$trg)

  ##ex vivo targets; the plot is assigned and passed to ggsave
  ##explicitly - inside a function the plot is never printed, so
  ##relying on ggsave's last-plot default was unreliable
  p_exvivo <- sr |>
    subset(trg %in% exvivo) |>
    ggplot(aes(x = value,alpha = 0.8)) +
    geom_histogram() + facet_grid(model~trg) +
    ggtitle(paste0(metric,' evaluated on ex vivo data'))

  ggsave(paste0(metric,'exVivoPerformance.png'), plot = p_exvivo)

  ##cell line (non ex vivo) targets
  p_cell <- sr |>
    subset(!trg %in% exvivo) |>
    ggplot(aes(x = value,alpha = 0.8)) +
    geom_histogram() + facet_grid(model~trg) +
    ggtitle(paste0(metric,' evaluated on cell line data'))

  ggsave(paste0(metric,'CellLinePerformance.png'), plot = p_cell)
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
---
2+
title: "IMPROVE cell line results"
3+
author: "Sara Gosline"
4+
date: "2025-03-27"
5+
output: html_document
6+
---
7+
8+
This document focuses on results we can glean from the cell line specific analysis using the IMPROVE framework and coderdata
9+
```{r setup, include = FALSE}
# configure knitr and load the shared figure-plotting helpers
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(ggplot2)
source('coderdataResultsFunctions.R')

```
16+
17+
## Collect cross-model performance data and try out plotting
18+
Currently the cross-model data results have been uploaded to synapse. Please request access on the [synapse team site](https://www.synapse.org/Team:3545388).
19+
20+
```{r data, message=FALSE, echo=FALSE, warning=FALSE}
cdres <- getModelPerformanceData()

metrics <- c('pcc','scc')
res <- lapply(metrics,function(x) {
  # bug fix: `doFullPlot` is not defined anywhere; the helper defined in
  # coderdataResultsFunctions.R with this signature is ridgelineMetricPlots
  res <- ridgelineMetricPlots(x,cdres)
  pg <- cowplot::plot_grid(res$src,res$trg)
  # bug fix: the file name referenced undefined `metric` (the lapply
  # variable is `x`); also pass the grid explicitly to ggsave
  ggsave(paste0('all_',x,'_ridglines.pdf'),height = 12,width = 10, plot = pg)
  return(res$src)
})

print(res)

```
34+
35+
## Figure 3A: performance on cell lines
36+
37+
First result: evaluation on cell lines.
38+
39+
```{r model performance, message = FALSE, warning = FALSE}
#current list of ex vivo datasets. include liverpdo when complete

# keep only cell line evaluation targets
ccdres <- subset(cdres,!trg %in% exvivo)

ccres = lapply(metrics,function(x) {
  # bug fix: `doFullPlot` is not defined; use ridgelineMetricPlots from
  # coderdataResultsFunctions.R
  res <- ridgelineMetricPlots(x, ccdres)
  pg <- cowplot::plot_grid(res$src,res$trg)
  # bug fix: previously used undefined `metric` instead of `x`; pass
  # the grid explicitly to ggsave
  ggsave(paste0('celllines',x,'_ridglines.pdf'),height = 8,width = 10, plot = pg)
  return(res$src)
})

print(ccres)

```
55+
56+
## Figure 3B - training/test set size
57+
58+
## Compare dataset size to performance
59+
60+
We wonder if the dataset size affects the predictive power.
61+
62+
```{r dataset size}

#number of combos (drug/sample measurement pairs) per source dataset
combos = list(beataml = 3033,ccle = 10911,ctrpv2 = 303520,fimm = 2457 ,gcsi = 12320,
              gdscv1 = 105808,gdscv2 = 45323,mpnst = 250, nci60 = 2317205,prism = 633169)

numsamples = list()
numdrugs = list()
#todo: we can also evaluate number of samples or drugs

#we can get performance summaries per source/target/model
gres <- ccdres |>
  subset(model!='uno')|>
  subset(met=='scc') |>
  group_by(met,src,trg,model) |>
  summarize(meanVal=mean(value)) |>
  left_join(data.frame(src = names(combos),sampleNum = unlist(combos))) |>
  arrange(meanVal)

# mean-of-means per source, used to order the x axis below
mom <- gres|>group_by(src,sampleNum)|>summarize(mv=mean(meanVal))|>arrange(mv)

#gres <- subset(gres,met=='scc')
gres$src = factor(gres$src,levels=unique(mom$src))
gres |>
  ggplot(aes(x=src,y=meanVal,fill=model))+geom_boxplot()#+geom_jitter()

```
90+
91+
92+
# Dataset prediction parsing
93+
Now we have to go into the individual predictions to pull out trends
94+
95+
## Figure 3
96+
97+
What does this figure look like
98+
99+
100+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
---
2+
title: "Figure 4 ex vivo results"
3+
author: "Sara Gosline"
4+
date: "2025-06-02"
5+
output: html_document
6+
---
7+
8+
```{r setup, include=FALSE}
# global knitr configuration for this document
knitr::opts_chunk$set(echo = TRUE)
```
11+
12+
Here we are focused on getting the details from a model prediction algorithm - where did it fail, where did it succeed?
13+
14+
## Data and packages
15+
16+
First we get the packages loaded and logged into synapse.
17+
```{r packages, include = FALSE}
# bug fix: this chunk was also labeled `setup`, and duplicate chunk
# labels make knitr abort rendering - relabeled to `packages`
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(ggplot2)
source('coderdataResultsFunctions.R')

```
24+
25+
26+
The data has been uploaded by Natasha and can be downloaded as follows.
27+
28+
```{r download data}

cdres <- getModelPerformanceData()

# keep only ex vivo evaluation targets (`exvivo` comes from
# coderdataResultsFunctions.R)
ecdres <- subset(cdres,trg %in% exvivo)


```
36+
37+
## Figure 4A
38+
39+
40+
```{r}
# bug fix: `metrics` was never defined in this document (only in the
# cell line notebook), so define it here
metrics <- c('pcc','scc')
exres = lapply(metrics,function(x) {
  # bug fix: `doFullPlot` is not defined; use ridgelineMetricPlots from
  # coderdataResultsFunctions.R
  ridgelineMetricPlots(x, ecdres,'cellline')
})

print(exres)
```
47+
## Create function to dive in
48+
49+
50+
51+
```{r}
52+
53+
```
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
---
2+
title: "Figure 5 protein comparison"
3+
output: html_notebook
4+
---
5+
6+
The last figure is what we want to do to compare multiple omics measurements
7+
8+
```{r}
# configure knitr and load the shared figure-plotting helpers
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(ggplot2)
source('coderdataResultsFunctions.R')

```
15+
16+
17+
18+
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
---
2+
title: "Improve results interrogation"
3+
author: "Sara Gosline"
4+
date: "2025-06-02"
5+
output: html_document
6+
---
7+
8+
```{r setup, include=FALSE}
# global knitr configuration for this document
knitr::opts_chunk$set(echo = TRUE)
```
11+
12+
Here we are focused on getting the details from a model prediction algorithm - where did it fail, where did it succeed?
13+
14+
## Data and packages
15+
16+
First we get the packages loaded and logged into synapse.
17+
```{r packages, include = FALSE}
# bug fix: this chunk was also labeled `setup`, and duplicate chunk
# labels make knitr abort rendering - relabeled to `packages`
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(synapser)
library(ggplot2)
library(ggridges)
# log into synapse so downloads in later chunks work
synapser::synLogin()

```
26+
27+
28+
The data has been uploaded by Natasha and can be downloaded as follows.
29+
30+
```{r download data}
31+
32+
33+
```
34+
35+
36+
37+
## Create function to dive in
38+
39+
40+
41+
```{r}
42+
43+
```

0 commit comments

Comments
 (0)