Skip to content

Commit 18361db

Browse files
authored
Merge pull request #414 from PNNL-CompBio/figure3-updates
Figure3 updates
2 parents dd395ac + 81ef02d commit 18361db

3 files changed

Lines changed: 160 additions & 58 deletions

File tree

manuscript/coderdataResultsFunctions.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ names(ecols) <- exvivo
2424

2525
datasetcolors <- c(ccols,ecols)
2626

27-
synapser::synLogin()
27+
syn <- synapser::synLogin()
2828

2929
getProteomicsData <- function(){
3030

manuscript/figure3CellLinePlots.Rmd

Lines changed: 157 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
title: "IMPROVE cell line results"
3-
author: "Sara gosline"
3+
author: "Sara Gosline and Yannick Mahlich"
44
date: "2025-03-27"
55
output: html_document
66
---
@@ -71,24 +71,172 @@ ggsave('cellLineSamplePerformanceCorrelation.pdf', plot,height=12,width=5)
7171
# Dataset prediction parsing
7272
Now we have to go into the individual predidictions to pull out trends
7373

74-
## Figure 3C
74+
## Figure 3C - AUC Study
7575

76-
First we can compare actual AUCs to predictive power
76+
First we can compare actual AUCs to predictive power. Does true AUC correlate with how well an algorithm works?
7777

78-
```{r auc calculation}
78+
### Multipanel correlation plots for CCLE predictions
7979

80-
##parse data, plot results
80+
First we evaluate how this hypothesis bears out in CCLE predictions.
81+
82+
Full model predictions are stored on synapse as parquet files. Individual
83+
datasets can be downloaded via `getModelPredictionData` in
84+
`coderdataResultsFunctions.R` (sources during the setup process).
85+
86+
```{r}
87+
tgt = 'ccle'
88+
89+
all_preds <- do.call(
90+
rbind,
91+
lapply(
92+
models,
93+
function(mdl) getModelPredictionData(dset = mdl) |>
94+
dplyr::filter(target == tgt & source != tgt & source != 'beataml' & source != 'mpnst') |>
95+
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
96+
collect()
97+
)
98+
)
99+
```
100+
101+
102+
```{r}
103+
plot_panel <- function(data, title){
104+
data <- sample_n(data, 10000)
105+
plot <- (
106+
ggplot(data, aes(x=auc_pred, y=auc_true))
107+
+ geom_point()
108+
+ geom_smooth(method=lm)
109+
+ facet_grid(source ~ model)
110+
+ ggtitle(title)
111+
# + xlim(0, 1)
112+
# + ylim(0, 1.25)
113+
)
114+
}
115+
```
116+
117+
We add more statistics to the data here
118+
119+
```{r}
120+
all_preds <- all_preds |>
121+
mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
122+
mutate(diff = abs(auc_true - auc_pred)) |>
123+
mutate(norm_diff = diff/auc_true)
124+
125+
```
126+
127+
128+
129+
```{r}
130+
ranges <- list('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75')
131+
plots <- lapply(ranges, function(auc_range){
132+
data <- all_preds |> filter(auc_ranges == auc_range) |> collect()
133+
plot_panel(data, auc_range)
134+
})
135+
plot <- arrangeGrob(grobs = plots, ncol = 3)
136+
ggsave('ccle_auc_plot.pdf', plot, dpi=300, width=30, height=10)
137+
138+
ggplot(all_preds,aes(x=diff,fill=model))+geom_histogram()+facet_grid(source~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors)
81139
82140
```
83141

142+
### Summarize error and plot across all data
143+
144+
This is still too much data, since it only represents CCLE preditions, let's try to compute summaries independently and visualize those
145+
146+
```{r summaries}
84147
85-
## Figure 3D
148+
all_stats <- do.call(
149+
rbind,
150+
lapply(
151+
models,
152+
function(mdl) getModelPredictionData(dset = mdl) |>
153+
dplyr::select(auc_true,auc_pred,source,target,model) |> ##remove columsn to consume less memory
154+
dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
155+
dplyr::filter(source != 'beataml' & source != 'mpnst') |>
156+
dplyr::filter(source != target) |>
157+
collect() |>
158+
mutate(diff = abs(auc_true - auc_pred)) |>
159+
mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf),
160+
labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |>
161+
group_by(source, target, auc_ranges,model) |>
162+
summarize(`Median Difference` = median(diff))
163+
))
164+
165+
ggplot(all_stats,aes(x = `Median Difference`,fill = model)) + geom_histogram() + facet_grid(~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors)
166+
167+
ggsave('medianDifferenceCellLine.pdf')
168+
169+
all_stats |>
170+
ggplot(aes(x = auc_ranges,y = `Median Difference`,fill = model)) +
171+
geom_bar(position = 'dodge',stat = 'identity') +
172+
facet_grid(target~source) +
173+
coord_flip() +
174+
scale_fill_manual(values = modelcolors)
175+
ggsave('medianDifferenceCellLine_byDataset.pdf')
176+
177+
178+
```
86179

87-
Compare drug sample performance
88180

89-
are there better performing drugs/samples?
181+
## Figure 3D - Drug panel
182+
183+
Now let's look across different drugs. Filter by drugs that show up in all models/datasets and then evaluate which perform best/worst.
184+
185+
```{r drugs}
186+
drug_meds <- do.call(
187+
rbind,
188+
lapply(
189+
models,
190+
function(mdl) getModelPredictionData(dset = mdl) |>
191+
dplyr::select(auc_true,auc_pred,improve_chem_id,source,target,model) |> ##remove columsn to consume less memory
192+
dplyr::filter(source != 'beataml' & source != 'mpnst') |>
193+
dplyr::filter(source != target)|>
194+
dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
195+
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
196+
collect() |>
197+
mutate(diff = abs(auc_true - auc_pred)) |>
198+
mutate(norm_diff = diff/auc_true) |>
199+
group_by(source, target, improve_chem_id, model) |>
200+
summarize(`Median Difference` = median(diff))
201+
)
202+
)
203+
204+
drug_aucs <- do.call(
205+
rbind,
206+
lapply(
207+
models,
208+
function(mdl) getModelPredictionData(dset = mdl) |>
209+
dplyr::select(auc_true,improve_chem_id,source,target) |> ##remove columsn to consume less memory
210+
dplyr::filter(source != 'beataml' & source != 'mpnst') |>
211+
dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |>
212+
#dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |>
213+
collect() |>
214+
distinct()
215+
)
216+
)
217+
218+
219+
220+
221+
##further filter the drugs to only includet hose that show up in all datasets
222+
dcounts <- drug_aucs|>group_by(source,target)|>summarize(drugs=n_distinct(improve_chem_id))|>arrange(drugs)
223+
224+
print(dcounts)
225+
##there are 23 drugs thats how up in all datasets
226+
mindrugs <- drug_aucs |>
227+
subset(source == 'ccle') |>
228+
subset(target == 'ccle') |>
229+
ungroup() |>
230+
select(improve_chem_id) |>
231+
distinct()
232+
233+
diff <- drug_meds |>
234+
subset(improve_chem_id %in% mindrugs$improve_chem_id)|>
235+
ggplot(aes(x=reorder(improve_chem_id,`Median Difference`),y=`Median Difference`,fill=model))+geom_boxplot()+scale_fill_manual(values=modelcolors) +
236+
coord_flip()
237+
238+
ggsave('diffBySharedDrugs.pdf',diff)
90239
91-
```{r sample/drug performance}
92240
93241
```
94242

manuscript/figure4ExVivoResults.Rmd

Lines changed: 2 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
title: "Figure 4 ex vivo results"
3-
author: "Sara Gosline"
3+
author: "Sara Gosline and Yannick Mahlich"
44
date: "2025-06-02"
55
output: html_document
66
---
@@ -60,53 +60,7 @@ print(plot)
6060
6161
```
6262

63-
## Create funtion to dive in
64-
65-
66-
```{r}
67-
tgt = 'ccle'
68-
69-
all_preds <- do.call(
70-
rbind,
71-
lapply(
72-
models,
73-
function(mdl) getModelPredictionData(dset = mdl) |>
74-
dplyr::filter(target == tgt & source != tgt & source != 'beataml' & source != 'mpnst') |>
75-
collect()
76-
)
77-
)
78-
```
79-
```{r}
80-
plot_panel <- function(data, title){
81-
data <- sample_n(data, 10000)
82-
plot <- (
83-
ggplot(data, aes(x=auc_pred, y=auc_true))
84-
+ geom_point()
85-
+ geom_smooth(method=lm)
86-
+ facet_grid(source ~ model)
87-
+ ggtitle(title)
88-
# + xlim(0, 1)
89-
# + ylim(0, 1.25)
90-
)
91-
}
92-
```
93-
```{r}
94-
all_preds <- all_preds |> mutate(auc_ranges = cut(auc_true, c(-Inf, 0.2, 0.8, Inf), labels = c('auc_true <= 0.2', '0.2 < auc_true <= 0.8', 'auc_true > 0.8')))
95-
96-
```
97-
```{r}
98-
ranges <- list('auc_true <= 0.2', '0.2 < auc_true <= 0.8', 'auc_true > 0.8')
99-
plots <- lapply(ranges, function(auc_range){
100-
data <- all_preds |> filter(auc_ranges == auc_range) |> collect()
101-
plot_panel(data, auc_range)
102-
})
103-
plot <- arrangeGrob(grobs = plots, ncol = 3)
104-
ggsave('ccle_auc_plot.pdf', plot, dpi=300, width=30, height=10)
105-
```
106-
107-
Full model predictions are stored on synapse as parquet files. Individual
108-
datasets can be downloaded via `getModelPredictionData` in
109-
`coderdataResultsFunctions.R` (sources during the setup process).
63+
## MPNST test
11064

11165
Here we download data from all models, subset to only MPNST target predictions
11266
and combine the individual subsets into one master table.

0 commit comments

Comments
 (0)