|
1 | 1 | --- |
2 | 2 | title: "IMPROVE cell line results" |
3 | | -author: "Sara gosline" |
| 3 | +author: "Sara Gosline and Yannick Mahlich" |
4 | 4 | date: "2025-03-27" |
5 | 5 | output: html_document |
6 | 6 | --- |
@@ -71,24 +71,172 @@ ggsave('cellLineSamplePerformanceCorrelation.pdf', plot,height=12,width=5) |
71 | 71 | # Dataset prediction parsing |
72 | 72 | Now we have to go into the individual predictions to pull out trends |
73 | 73 |
|
74 | | -## Figure 3C |
| 74 | +## Figure 3C - AUC Study |
75 | 75 |
|
76 | | -First we can compare actual AUCs to predictive power |
| 76 | +First we can compare actual AUCs to predictive power. Does true AUC correlate with how well an algorithm works? |
77 | 77 |
|
78 | | -```{r auc calculation} |
| 78 | +### Multipanel correlation plots for CCLE predictions |
79 | 79 |
|
80 | | -##parse data, plot results |
| 80 | +First we evaluate how this hypothesis bears out in CCLE predictions. |
| 81 | + |
| 82 | +Full model predictions are stored on synapse as parquet files. Individual |
| 83 | +datasets can be downloaded via `getModelPredictionData` in |
| 84 | +`coderdataResultsFunctions.R` (sourced during the setup process). |
| 85 | + |
| 86 | +```{r} |
| 87 | +tgt = 'ccle' |
| 88 | +
|
| 89 | +all_preds <- do.call( |
| 90 | + rbind, |
| 91 | + lapply( |
| 92 | + models, |
| 93 | + function(mdl) getModelPredictionData(dset = mdl) |> |
| 94 | + dplyr::filter(target == tgt & source != tgt & source != 'beataml' & source != 'mpnst') |> |
| 95 | + #dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |> |
| 96 | + collect() |
| 97 | + ) |
| 98 | + ) |
| 99 | +``` |
| 100 | + |
| 101 | + |
| 102 | +```{r} |
| 103 | +plot_panel <- function(data, title){ |
| 104 | + data <- sample_n(data, 10000) |
| 105 | + plot <- ( |
| 106 | + ggplot(data, aes(x=auc_pred, y=auc_true)) |
| 107 | + + geom_point() |
| 108 | + + geom_smooth(method=lm) |
| 109 | + + facet_grid(source ~ model) |
| 110 | + + ggtitle(title) |
| 111 | + # + xlim(0, 1) |
| 112 | + # + ylim(0, 1.25) |
| 113 | + ) |
| 114 | +} |
| 115 | +``` |
| 116 | + |
| 117 | +We add more statistics to the data here |
| 118 | + |
| 119 | +```{r} |
| 120 | +all_preds <- all_preds |> |
| 121 | + mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |> |
| 122 | + mutate(diff = abs(auc_true - auc_pred)) |> |
| 123 | + mutate(norm_diff = diff/auc_true) |
| 124 | +
|
| 125 | +``` |
| 126 | + |
| 127 | + |
| 128 | + |
| 129 | +```{r} |
| 130 | +ranges <- list('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75') |
| 131 | +plots <- lapply(ranges, function(auc_range){ |
| 132 | + data <- all_preds |> filter(auc_ranges == auc_range) |> collect() |
| 133 | + plot_panel(data, auc_range) |
| 134 | +}) |
| 135 | +plot <- arrangeGrob(grobs = plots, ncol = 3) |
| 136 | +ggsave('ccle_auc_plot.pdf', plot, dpi=300, width=30, height=10) |
| 137 | +
|
| 138 | +ggplot(all_preds,aes(x=diff,fill=model))+geom_histogram()+facet_grid(source~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors) |
81 | 139 |
|
82 | 140 | ``` |
83 | 141 |
|
| 142 | +### Summarize error and plot across all data |
| 143 | + |
| 144 | +This is still too much data, since it only represents CCLE predictions; let's try to compute summaries independently and visualize those |
| 145 | + |
| 146 | +```{r summaries} |
84 | 147 |
|
85 | | -## Figure 3D |
| 148 | +all_stats <- do.call( |
| 149 | + rbind, |
| 150 | + lapply( |
| 151 | + models, |
| 152 | + function(mdl) getModelPredictionData(dset = mdl) |> |
| 153 | + dplyr::select(auc_true,auc_pred,source,target,model) |> ##remove columns to consume less memory |
| 154 | + dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |> |
| 155 | + dplyr::filter(source != 'beataml' & source != 'mpnst') |> |
| 156 | + dplyr::filter(source != target) |> |
| 157 | + collect() |> |
| 158 | + mutate(diff = abs(auc_true - auc_pred)) |> |
| 159 | + mutate(auc_ranges = cut(auc_true, c(-Inf, 0.25, 0.75, Inf), |
| 160 | + labels = c('auc_true <= 0.25', '0.25 < auc_true <= 0.75', 'auc_true > 0.75'))) |> |
| 161 | + group_by(source, target, auc_ranges,model) |> |
| 162 | + summarize(`Median Difference` = median(diff)) |
| 163 | + )) |
| 164 | +
|
| 165 | +ggplot(all_stats,aes(x = `Median Difference`,fill = model)) + geom_histogram() + facet_grid(~auc_ranges)+scale_y_log10()+scale_fill_manual(values=modelcolors) |
| 166 | +
|
| 167 | +ggsave('medianDifferenceCellLine.pdf') |
| 168 | +
|
| 169 | +all_stats |> |
| 170 | + ggplot(aes(x = auc_ranges,y = `Median Difference`,fill = model)) + |
| 171 | + geom_bar(position = 'dodge',stat = 'identity') + |
| 172 | + facet_grid(target~source) + |
| 173 | + coord_flip() + |
| 174 | + scale_fill_manual(values = modelcolors) |
| 175 | +ggsave('medianDifferenceCellLine_byDataset.pdf') |
| 176 | +
|
| 177 | +
|
| 178 | +``` |
86 | 179 |
|
87 | | -Compare drug sample performance |
88 | 180 |
|
89 | | -are there better performing drugs/samples? |
| 181 | +## Figure 3D - Drug panel |
| 182 | + |
| 183 | +Now let's look across different drugs. Filter by drugs that show up in all models/datasets and then evaluate which perform best/worst. |
| 184 | + |
| 185 | +```{r drugs} |
| 186 | +drug_meds <- do.call( |
| 187 | + rbind, |
| 188 | + lapply( |
| 189 | + models, |
| 190 | + function(mdl) getModelPredictionData(dset = mdl) |> |
| 191 | + dplyr::select(auc_true,auc_pred,improve_chem_id,source,target,model) |> ##remove columns to consume less memory |
| 192 | + dplyr::filter(source != 'beataml' & source != 'mpnst') |> |
| 193 | + dplyr::filter(source != target)|> |
| 194 | + dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |> |
| 195 | + #dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |> |
| 196 | + collect() |> |
| 197 | + mutate(diff = abs(auc_true - auc_pred)) |> |
| 198 | + mutate(norm_diff = diff/auc_true) |> |
| 199 | + group_by(source, target, improve_chem_id, model) |> |
| 200 | + summarize(`Median Difference` = median(diff)) |
| 201 | + ) |
| 202 | + ) |
| 203 | +
|
| 204 | +drug_aucs <- do.call( |
| 205 | + rbind, |
| 206 | + lapply( |
| 207 | + models, |
| 208 | + function(mdl) getModelPredictionData(dset = mdl) |> |
| 209 | + dplyr::select(auc_true,improve_chem_id,source,target) |> ##remove columsn to consume less memory |
| 210 | + dplyr::filter(source != 'beataml' & source != 'mpnst') |> |
| 211 | + dplyr::filter(!target %in% c('beataml','sarcpdo','pancpdo','mpnst','bladderpdo')) |> |
| 212 | + #dplyr::filter(source != tgt & source != 'beataml' & source != 'mpnst') |> |
| 213 | + collect() |> |
| 214 | + distinct() |
| 215 | + ) |
| 216 | + ) |
| 217 | +
|
| 218 | +
|
| 219 | +
|
| 220 | +
|
| 221 | +##further filter the drugs to only include those that show up in all datasets |
| 222 | +dcounts <- drug_aucs|>group_by(source,target)|>summarize(drugs=n_distinct(improve_chem_id))|>arrange(drugs) |
| 223 | +
|
| 224 | +print(dcounts) |
| 225 | +##there are 23 drugs that show up in all datasets |
| 226 | +mindrugs <- drug_aucs |> |
| 227 | + subset(source == 'ccle') |> |
| 228 | + subset(target == 'ccle') |> |
| 229 | + ungroup() |> |
| 230 | + select(improve_chem_id) |> |
| 231 | + distinct() |
| 232 | +
|
| 233 | +diff <- drug_meds |> |
| 234 | + subset(improve_chem_id %in% mindrugs$improve_chem_id)|> |
| 235 | + ggplot(aes(x=reorder(improve_chem_id,`Median Difference`),y=`Median Difference`,fill=model))+geom_boxplot()+scale_fill_manual(values=modelcolors) + |
| 236 | + coord_flip() |
| 237 | +
|
| 238 | +ggsave('diffBySharedDrugs.pdf',diff) |
90 | 239 |
|
91 | | -```{r sample/drug performance} |
92 | 240 |
|
93 | 241 | ``` |
94 | 242 |
|
|
0 commit comments