Skip to content

Commit 81ef02d

Browse files
committed
updated plot ideas for manuscript
1 parent 00438e8 commit 81ef02d

2 files changed

Lines changed: 157 additions & 124 deletions

File tree

manuscript/figure3CellLinePlots.Rmd

Lines changed: 157 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
title: "IMPROVE cell line results"
3-
author: "Sara gosline"
3+
author: "Sara Gosline and Yannick Mahlich"
44
date: "2025-03-27"
55
output: html_document
66
---
@@ -71,24 +71,172 @@ ggsave('cellLineSamplePerformanceCorrelation.pdf', plot,height=12,width=5)
7171
# Dataset prediction parsing
7272
Now we have to go into the individual predictions to pull out trends
7373

74-
## Figure 3C
74+
## Figure 3C - AUC Study
7575

76-
First we can compare actual AUCs to predictive power
76+
First we can compare actual AUCs to predictive power. Does true AUC correlate with how well an algorithm works?
7777

78-
```{r auc calculation}
78+
### Multipanel correlation plots for CCLE predictions
7979

80-
##parse data, plot results
80+
First we evaluate how this hypothesis bears out in CCLE predictions.
81+
82+
Full model predictions are stored on synapse as parquet files. Individual
83+
datasets can be downloaded via `getModelPredictionData` in
84+
`coderdataResultsFunctions.R` (sourced during the setup process).
85+
86+
```{r}
87+
tgt = 'ccle'
88+
89+
all_preds <- do.call(
90+
rbind,
91+
lapply(
92+
models,
93+
function(mdl) getModelPredictionData(dset = mdl) |>
94+
dplyr::filter(target == tgt & source != tgt & source != 'beataml' & source != 'mpnst') |>
95+
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
96+
collect()
97+
)
98+
)
99+
```
100+
101+
102+
```{r}
103+
plot_panel <- function(data, title){
104+
data <- sample_n(data, 10000)
105+
plot <- (
106+
ggplot(data, aes(x=auc_pred, y=auc_true))
107+
+ geom_point()
108+
+ geom_smooth(method=lm)
109+
+ facet_grid(source ~ model)
110+
+ ggtitle(title)
111+
# + xlim(0, 1)
112+
# + ylim(0, 1.25)
113+
)
114+
}
115+
```
116+
117+
We add more statistics to the data here
118+
119+
```{r}
120+
all_preds <- all_preds |>
121+
mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
122+
mutate(diff = abs(auc_true - auc_pred)) |>
123+
mutate(norm_diff = diff/auc_true)
124+
125+
```
126+
127+
128+
129+
```{r}
130+
ranges <- list('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75')
131+
plots <- lapply(ranges, function(auc_range){
132+
data <- all_preds |> filter(auc_ranges == auc_range) |> collect()
133+
plot_panel(data, auc_range)
134+
})
135+
plot <- arrangeGrob(grobs = plots, ncol = 3)
136+
ggsave('ccle_auc_plot.pdf', plot, dpi=300, width=30, height=10)
137+
138+
ggplot(all_preds,aes(x=diff,fill=model))+geom_histogram()+facet_grid(source~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors)
81139
82140
```
83141

142+
### Summarize error and plot across all data
143+
144+
This is still too much data, since it only represents CCLE predictions; let's try to compute summaries independently and visualize those
145+
146+
```{r summaries}
84147
85-
## Figure 3D
148+
all_stats <- do.call(
149+
rbind,
150+
lapply(
151+
models,
152+
function(mdl) getModelPredictionData(dset = mdl) |>
153+
dplyr::select(auc_true,auc_pred,source,target,model) |> ##remove columsn to consume less memory
154+
dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
155+
dplyr::filter(source != 'beataml' & source != 'mpnst') |>
156+
dplyr::filter(source != target) |>
157+
collect() |>
158+
mutate(diff = abs(auc_true - auc_pred)) |>
159+
mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf),
160+
labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
161+
group_by(source, target, auc_ranges,model) |>
162+
summarize(`Median Difference` = median(diff))
163+
))
164+
165+
ggplot(all_stats,aes(x = `Median Difference`,fill = model)) + geom_histogram() + facet_grid(~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors)
166+
167+
ggsave('medianDifferenceCellLine.pdf')
168+
169+
all_stats |>
170+
ggplot(aes(x = auc_ranges,y = `Median Difference`,fill = model)) +
171+
geom_bar(position = 'dodge',stat = 'identity') +
172+
facet_grid(target~source) +
173+
coord_flip() +
174+
scale_fill_manual(values = modelcolors)
175+
ggsave('medianDifferenceCellLine_byDataset.pdf')
176+
177+
178+
```
86179

87-
Compare drug sample performance
88180

89-
are there better performing drugs/samples?
181+
## Figure 3D - Drug panel
182+
183+
Now let's look across different drugs. Filter by drugs that show up in all models/datasets and then evaluate which perform best/worst.
184+
185+
```{r drugs}
186+
drug_meds <- do.call(
187+
rbind,
188+
lapply(
189+
models,
190+
function(mdl) getModelPredictionData(dset = mdl) |>
191+
dplyr::select(auc_true,auc_pred,improve_chem_id,source,target,model) |> ##remove columsn to consume less memory
192+
dplyr::filter(source != 'beataml' & source != 'mpnst') |>
193+
dplyr::filter(source != target)|>
194+
dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
195+
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
196+
collect() |>
197+
mutate(diff = abs(auc_true - auc_pred)) |>
198+
mutate(norm_diff = diff/auc_true) |>
199+
group_by(source, target, improve_chem_id, model) |>
200+
summarize(`Median Difference` = median(diff))
201+
)
202+
)
203+
204+
drug_aucs <- do.call(
205+
rbind,
206+
lapply(
207+
models,
208+
function(mdl) getModelPredictionData(dset = mdl) |>
209+
dplyr::select(auc_true,improve_chem_id,source,target) |> ##remove columsn to consume less memory
210+
dplyr::filter(source != 'beataml' & source != 'mpnst') |>
211+
dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
212+
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
213+
collect() |>
214+
distinct()
215+
)
216+
)
217+
218+
219+
220+
221+
##further filter the drugs to only include those that show up in all datasets
222+
dcounts <- drug_aucs|>group_by(source,target)|>summarize(drugs=n_distinct(improve_chem_id))|>arrange(drugs)
223+
224+
print(dcounts)
225+
##there are 23 drugs that show up in all datasets
226+
mindrugs <- drug_aucs |>
227+
subset(source == 'ccle') |>
228+
subset(target == 'ccle') |>
229+
ungroup() |>
230+
select(improve_chem_id) |>
231+
distinct()
232+
233+
diff <- drug_meds |>
234+
subset(improve_chem_id %in% mindrugs$improve_chem_id)|>
235+
ggplot(aes(x=reorder(improve_chem_id,`Median Difference`),y=`Median Difference`,fill=model))+geom_boxplot()+scale_fill_manual(values=modelcolors) +
236+
coord_flip()
237+
238+
ggsave('diffBySharedDrugs.pdf',diff)
90239
91-
```{r sample/drug performance}
92240
93241
```
94242

manuscript/figure4ExVivoResults.Rmd

Lines changed: 0 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -60,121 +60,6 @@ print(plot)
6060
6161
```
6262

63-
## AUC Study
64-
65-
Does true AUC correlate with how well an algorithm works?
66-
67-
### Multipanel correlation plots
68-
69-
First we evaluate how this hypothesis bears out in CCLE predictions.
70-
71-
Full model predictions are stored on synapse as parquet files. Individual
72-
datasets can be downloaded via `getModelPredictionData` in
73-
`coderdataResultsFunctions.R` (sources during the setup process).
74-
75-
```{r}
76-
tgt = 'ccle'
77-
78-
all_preds <- do.call(
79-
rbind,
80-
lapply(
81-
models,
82-
function(mdl) getModelPredictionData(dset = mdl) |>
83-
dplyr::filter(target == tgt & source != tgt & source != 'beataml' & source != 'mpnst') |>
84-
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
85-
collect()
86-
)
87-
)
88-
```
89-
90-
91-
```{r}
92-
plot_panel <- function(data, title){
93-
data <- sample_n(data, 10000)
94-
plot <- (
95-
ggplot(data, aes(x=auc_pred, y=auc_true))
96-
+ geom_point()
97-
+ geom_smooth(method=lm)
98-
+ facet_grid(source ~ model)
99-
+ ggtitle(title)
100-
# + xlim(0, 1)
101-
# + ylim(0, 1.25)
102-
)
103-
}
104-
```
105-
106-
We add more statistics to the data here
107-
108-
```{r}
109-
all_preds <- all_preds |>
110-
mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
111-
mutate(diff = abs(auc_true - auc_pred)) |>
112-
mutate(norm_diff = diff/auc_true)
113-
114-
```
115-
116-
117-
118-
```{r}
119-
ranges <- list('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75')
120-
plots <- lapply(ranges, function(auc_range){
121-
data <- all_preds |> filter(auc_ranges == auc_range) |> collect()
122-
plot_panel(data, auc_range)
123-
})
124-
plot <- arrangeGrob(grobs = plots, ncol = 3)
125-
ggsave('ccle_auc_plot.pdf', plot, dpi=300, width=30, height=10)
126-
```
127-
128-
### Summarize error and plot across all data
129-
130-
First lets see the over all distribution
131-
```{r error summary ploting}
132-
133-
ggplot(all_preds,aes(x=norm_diff,fill=model))+geom_histogram()+facet_grid(source~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors)
134-
```
135-
136-
This is still too much data, since it only represents CCLE preditions, let's try to compute summaries independetly.
137-
138-
```{r summaries}
139-
140-
all_summaries <- do.call(
141-
rbind,
142-
lapply(
143-
models,
144-
function(mdl) getModelPredictionData(dset = mdl) |>
145-
dplyr::select(auc_true,auc_pred,source,target,model) |> ##remove columsn to consume less memory
146-
dplyr::filter(source != 'beataml' & source != 'mpnst') |>
147-
dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
148-
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
149-
collect() |>
150-
mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
151-
mutate(diff = abs(auc_true - auc_pred)) |>
152-
mutate(norm_diff = diff/auc_true) |>
153-
group_by(source, target, auc_ranges,model) |>
154-
summarize(`Median Difference` = median(diff))
155-
)
156-
)
157-
158-
all_summaries |>
159-
ggplot(aes(x = auc_ranges,y = `Median Difference`,fill = model)) +
160-
geom_bar(position = 'dodge',stat = 'identity') +
161-
facet_grid(target~source) +
162-
coord_flip() +
163-
scale_fill_manual(values=modelcolors)
164-
165-
ggsave('medianDifferenceCellLine.pdf')
166-
167-
```
168-
169-
170-
## Drug panel
171-
172-
Now let's look across different drugs. Filter by drugs that show up in all models/datasets and then evaluate which perform best/worst.
173-
174-
```{r drugs}
175-
176-
```
177-
17863
## MPNST test
17964

18065
Here we download data from all models, subset to only MPNST target predictions

0 commit comments

Comments
 (0)