11---
22title : " Figure 4 ex vivo results"
3- author : " Sara Gosline"
3+ author : " Sara Gosline and Yannick Mahlich "
44date : " 2025-06-02"
55output : html_document
66---
@@ -60,8 +60,17 @@ print(plot)
6060
6161```
6262
63- ## Create funtion to dive in
63+ ## AUC Study
6464
65+ Does true AUC correlate with how well an algorithm works?
66+
67+ ### Multipanel correlation plots
68+
69+ First we evaluate how this hypothesis bears out in CCLE predictions.
70+
71+ Full model predictions are stored on synapse as parquet files. Individual
72+ datasets can be downloaded via ` getModelPredictionData ` in
73+ ` coderdataResultsFunctions.R ` (sourced during the setup process).
6574
6675``` {r}
6776tgt = 'ccle'
@@ -72,10 +81,13 @@ all_preds <- do.call(
7281 models,
7382 function(mdl) getModelPredictionData(dset = mdl) |>
7483 dplyr::filter(target == tgt & source != tgt & source != 'beataml' & source != 'mpnst') |>
84+ #dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
7585 collect()
7686 )
7787 )
7888```
89+
90+
7991``` {r}
8092plot_panel <- function(data, title){
8193 data <- sample_n(data, 10000)
@@ -90,12 +102,21 @@ plot_panel <- function(data, title){
90102 )
91103}
92104```
105+
106+ We add more statistics to the data here
107+
93108``` {r}
94- all_preds <- all_preds |> mutate(auc_ranges = cut(auc_true, c(-Inf, 0.2, 0.8, Inf), labels = c('auc_true <= 0.2', '0.2 < auc_true <= 0.8', 'auc_true > 0.8')))
109+ all_preds <- all_preds |>
110+ mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
111+ mutate(diff = abs(auc_true - auc_pred)) |>
112+ mutate(norm_diff = diff/auc_true)
95113
96114```
115+
116+
117+
97118``` {r}
98- ranges <- list('auc_true <= 0.2 ', '0.2 < auc_true <= 0.8 ', 'auc_true > 0.8 ')
119+ ranges <- list('auc_true <= 0.25 ', '0.25 < auc_true <= 0.75 ', 'auc_true > 0.75 ')
99120plots <- lapply(ranges, function(auc_range){
100121 data <- all_preds |> filter(auc_ranges == auc_range) |> collect()
101122 plot_panel(data, auc_range)
@@ -104,9 +125,57 @@ plot <- arrangeGrob(grobs = plots, ncol = 3)
104125ggsave('ccle_auc_plot.pdf', plot, dpi=300, width=30, height=10)
105126```
106127
107- Full model predictions are stored on synapse as parquet files. Individual
108- datasets can be downloaded via ` getModelPredictionData ` in
109- ` coderdataResultsFunctions.R ` (sources during the setup process).
128+ ### Summarize error and plot across all data
129+
130+ First, let's see the overall distribution.
131+ ``` {r error summary plotting}
132+
133+ ggplot(all_preds,aes(x=norm_diff,fill=model))+geom_histogram()+facet_grid(source~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors)
134+ ```
135+
136+ This is still too much data, since it only represents CCLE predictions; let's try to compute summaries independently.
137+
138+ ``` {r summaries}
139+
140+ all_summaries <- do.call(
141+ rbind,
142+ lapply(
143+ models,
144+ function(mdl) getModelPredictionData(dset = mdl) |>
145+ dplyr::select(auc_true,auc_pred,source,target,model) |> ##remove columns to consume less memory
146+ dplyr::filter(source != 'beataml' & source != 'mpnst') |>
147+ dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
148+ #dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
149+ collect() |>
150+ mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
151+ mutate(diff = abs(auc_true - auc_pred)) |>
152+ mutate(norm_diff = diff/auc_true) |>
153+ group_by(source, target, auc_ranges,model) |>
154+ summarize(`Median Difference` = median(diff))
155+ )
156+ )
157+
158+ all_summaries |>
159+ ggplot(aes(x = auc_ranges,y = `Median Difference`,fill = model)) +
160+ geom_bar(position = 'dodge',stat = 'identity') +
161+ facet_grid(target~source) +
162+ coord_flip() +
163+ scale_fill_manual(values=modelcolors)
164+
165+ ggsave('medianDifferenceCellLine.pdf')
166+
167+ ```
168+
169+
170+ ## Drug panel
171+
172+ Now let's look across different drugs. Filter by drugs that show up in all models/datasets and then evaluate which perform best/worst.
173+
174+ ``` {r drugs}
175+
176+ ```
177+
178+ ## MPNST test
110179
111180Here we download data from all models, subset to only MPNST target predictions
112181and combine the individual subsets into one master table.
0 commit comments