Skip to content

Commit dd395ac

Browse files
authored
Merge pull request #412 from PNNL-CompBio/manuscript-figure4
Parquet handling & Figure 4 draft based on MPNST data
2 parents b3ed31f + acd7ed6 commit dd395ac

2 files changed

Lines changed: 134 additions & 11 deletions

File tree

manuscript/coderdataResultsFunctions.R

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22

33
library(ggplot2)
44
library(dplyr)
5+
library(arrow)
56
library(ggridges)
67
library(synapser)
78
library(RColorBrewer)
89
##COLORS: standardize here
910

11+
models <- c('deepttc','graphdrp','lgbm','pathdsp','uno')
1012
modelcolors <- RColorBrewer::brewer.pal(n=6,name='RdYlBu')
11-
names(modelcolors) <- c('deepttc','graphdrp','lgbm','pathdsp','uno')
13+
names(modelcolors) <- models
1214

1315

1416
exvivo = c('mpnst','beataml','sarcpdo','pancpdo','bladderpdo','liverpdo')
@@ -67,18 +69,37 @@ getModelPerformanceData <- function(){
6769
}
6870

6971

70-
71-
###these files are very big so i'm not sure how to deal with them.
72+
# this currently only retrieves one dataset at a time and returns an appache
73+
# "arrow" tabular dataset object that can be interacted / queried via dplyr
7274
getModelPredictionData <- function(dset='lgbm') {
7375

74-
preds <- list(deepttc = 'syn68149793', graphdrp = 'syn68146828', lgbm = 'syn68149807', pathdsp = 'syn66772452', uno = 'syn68149809')
76+
preds <- list(
77+
deepttc = "syn68176968",
78+
graphdrp = "syn68176977",
79+
lgbm = "syn68176033",
80+
pathdsp = "syn68176970",
81+
uno = "syn68176971"
82+
)
7583

76-
fullres <- do.call(rbind,lapply(dset,function(mod)
77-
readr::read_csv(synapser::synGet(preds[[mod]])$path) |> mutate(model = mod)))
84+
dataset <- arrow::open_dataset(
85+
sources = synapser::synGet(preds[[dset]])$path,
86+
format = "parquet"
87+
)
7888

79-
return(preds)
89+
return(dataset)
8090
}
8191

92+
###these files are very big so i'm not sure how to deal with them.
93+
# getModelPredictionData <- function(dset='lgbm') {
94+
#
95+
# preds <- list(deepttc = 'syn68149793', graphdrp = 'syn68146828', lgbm = 'syn68149807', pathdsp = 'syn66772452', uno = 'syn68149809')
96+
#
97+
# fullres <- do.call(rbind,lapply(dset,function(mod)
98+
# readr::read_csv(synapser::synGet(preds[[mod]])$path) |> mutate(model = mod)))
99+
#
100+
# return(preds)
101+
# }
102+
82103
#this function plots a single metric by all the possible values
83104
#
84105
ridgelineMetricPlots <- function(metric,dataset=cdres, prefix='all'){

manuscript/figure4ExVivoResults.Rmd

Lines changed: 106 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,16 @@ First we get the packages loaded and logged into synapse.
1414
knitr::opts_chunk$set(echo = TRUE)
1515
library(tidyverse)
1616
library(ggplot2)
17+
library(arrow)
18+
library(dplyr)
19+
library(gridExtra)
1720
source('coderdataResultsFunctions.R')
18-
1921
```
2022

2123

2224
The data has been uploaded by natasha and can be downloaded as follows.
2325

24-
```{r download data}
26+
```{r download data, eval}
2527
2628
cdres <- getModelPerformanceData()
2729
@@ -33,7 +35,7 @@ The data has been uploaded by natasha and can be downloaded as follows.
3335
## Figure 4A
3436

3537

36-
```{r}
38+
```{r, eval=FALSE}
3739
metrics <- c('scc','pcc')
3840
exres = lapply(metrics,function(x) {
3941
res<-ridgelineMetricPlots(x, ecdres,'cellline')
@@ -47,7 +49,7 @@ print(exres)
4749

4850
Now we can confirm that the datasets follow the same pattern.
4951

50-
```{r dataset samples}
52+
```{r dataset samples, eval=FALSE}
5153
5254
5355
plot<-calcSourceStatistics('scc',ecdres)
@@ -61,7 +63,107 @@ print(plot)
6163
## Create funtion to dive in
6264

6365

66+
```{r}
67+
tgt = 'ccle'
68+
69+
all_preds <- do.call(
70+
rbind,
71+
lapply(
72+
models,
73+
function(mdl) getModelPredictionData(dset = mdl) |>
74+
dplyr::filter(target == tgt & source != tgt & source != 'beataml' & source != 'mpnst') |>
75+
collect()
76+
)
77+
)
78+
```
79+
```{r}
80+
plot_panel <- function(data, title){
81+
data <- sample_n(data, 10000)
82+
plot <- (
83+
ggplot(data, aes(x=auc_pred, y=auc_true))
84+
+ geom_point()
85+
+ geom_smooth(method=lm)
86+
+ facet_grid(source ~ model)
87+
+ ggtitle(title)
88+
# + xlim(0, 1)
89+
# + ylim(0, 1.25)
90+
)
91+
}
92+
```
93+
```{r}
94+
all_preds <- all_preds |> mutate(auc_ranges = cut(auc_true, c(-Inf, 0.2, 0.8, Inf), labels = c('auc_true <= 0.2', '0.2 < auc_true <= 0.8', 'auc_true > 0.8')))
6495
96+
```
6597
```{r}
98+
ranges <- list('auc_true <= 0.2', '0.2 < auc_true <= 0.8', 'auc_true > 0.8')
99+
plots <- lapply(ranges, function(auc_range){
100+
data <- all_preds |> filter(auc_ranges == auc_range) |> collect()
101+
plot_panel(data, auc_range)
102+
})
103+
plot <- arrangeGrob(grobs = plots, ncol = 3)
104+
ggsave('ccle_auc_plot.pdf', plot, dpi=300, width=30, height=10)
105+
```
106+
107+
Full model predictions are stored on synapse as parquet files. Individual
108+
datasets can be downloaded via `getModelPredictionData` in
109+
`coderdataResultsFunctions.R` (sources during the setup process).
110+
111+
Here we download data from all models, subset to only MPNST target predictions
112+
and combine the individual subsets into one master table.
113+
```{r predictions data import}
114+
115+
tgt = 'mpnst'
66116
117+
all_preds <- do.call(
118+
rbind,
119+
lapply(
120+
models,
121+
function(mdl) getModelPredictionData(dset = mdl) |>
122+
dplyr::filter(target == tgt & source != tgt & source != 'beataml') |>
123+
collect()
124+
)
125+
)
126+
127+
```
128+
129+
We want to group results by drugs i.e. create a panel per drug. To that end we
130+
extract the drugs and determine the "grid layout" by size of target drugs.
131+
```{r}
132+
133+
drugs <- unique(all_preds$improve_chem_id)
134+
grid_ncol = 4
135+
grid_nrow = ceiling(length(drugs) / grid_ncol)
136+
```
137+
138+
Defining the plot function for the individual "panels"
139+
```{r}
140+
plot_panel <- function(data, title){
141+
plot <- (
142+
ggplot(data, aes(x=auc_true, y=auc_pred))
143+
+ geom_point()
144+
+ geom_smooth(method=lm)
145+
+ facet_grid(source ~ model)
146+
+ ggtitle(title)
147+
+ xlim(0, 1)
148+
+ ylim(0, 1.25)
149+
)
150+
}
151+
152+
```
153+
154+
```{r}
155+
plots <- lapply(drugs, function(drug_id){
156+
data <- all_preds |> filter(improve_chem_id == drug_id) |> collect()
157+
plot_panel(data, drug_id)
158+
})
159+
```
160+
161+
```{r}
162+
plot <- arrangeGrob(grobs = plots, ncol = grid_ncol, nrow = grid_nrow)
163+
```
164+
165+
166+
```{r}
167+
# print(plot)
168+
ggsave('mpnst_auc_plot.pdf', plot, dpi=300, width=40, height=60, limitsize = FALSE)
67169
```

0 commit comments

Comments
 (0)