|
| 1 | +#!/usr/bin/env Rscript |
| 2 | + |
| 3 | +# Combined MPNST & MPNST-PDX Data Extraction Script |
| 4 | +# This script unifies data extraction for PDX, Tumor, and Xenograft-Derived Organoid samples. |
| 5 | + |
| 6 | +# Load required libraries |
| 7 | +library(data.table) |
| 8 | +library(synapser) |
| 9 | +library(dplyr) |
| 10 | +library(tidyr) |
| 11 | + |
| 12 | +# Retrieve command line arguments |
| 13 | +args <- commandArgs(trailingOnly = TRUE) |
| 14 | +if (length(args) < 3) { |
| 15 | + stop("Usage: Rscript 01_combined_omics.R <PAT> <samples.csv> <genes.csv>", call. = FALSE) |
| 16 | +} |
| 17 | +PAT <- args[1] |
| 18 | +samples <- args[2] |
| 19 | +genes <- args[3] |
| 20 | + |
| 21 | +# Log in to Synapse |
| 22 | +token <- PAT |
| 23 | +synLogin(authToken = token) |
| 24 | + |
| 25 | +# Read sample mapping and gene mapping |
| 26 | +samples_df <- fread(samples) %>% |
| 27 | + select(improve_sample_id, common_name, model_type) %>% |
| 28 | + distinct() |
| 29 | +genes_df <- fread(genes) |
| 30 | + |
| 31 | +# Subset by model type |
| 32 | +pdx_samps <- filter(samples_df, model_type == "patient derived xenograft") |
| 33 | +tumor_samps<- filter(samples_df, model_type == "tumor") |
| 34 | +mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") # These end up being the same as pdx_samps in the manifest. |
| 35 | + |
| 36 | +# Retrieve manifest table from Synapse |
| 37 | +manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% |
| 38 | + rename(common_name = Sample) |
| 39 | + |
| 40 | +# Build sample tables |
| 41 | +pdx_data <- manifest %>% |
| 42 | + select(common_name, starts_with("PDX")) %>% |
| 43 | + left_join(pdx_samps, by = "common_name") %>% |
| 44 | + select(improve_sample_id, common_name, model_type, |
| 45 | + RNASeq = PDX_RNASeq, |
| 46 | + Mutations = PDX_Somatic_Mutations, |
| 47 | + CopyNumber = PDX_CNV, |
| 48 | + Proteomics = PDX_Proteomics) %>% |
| 49 | + filter(!is.na(improve_sample_id)) |
| 50 | + |
| 51 | +tumor_data <- manifest %>% |
| 52 | + select(common_name, starts_with("Tumor")) %>% |
| 53 | + left_join(tumor_samps, by = "common_name") %>% |
| 54 | + select(improve_sample_id, common_name, model_type, |
| 55 | + RNASeq = Tumor_RNASeq, |
| 56 | + Mutations = Tumor_Somatic_Mutations, |
| 57 | + CopyNumber = Tumor_CNV) %>% |
| 58 | + mutate(Proteomics = "") %>% |
| 59 | + filter(!is.na(improve_sample_id)) |
| 60 | + |
| 61 | +mt_data <- manifest %>% #Note, this is the same as pdx_data but I think we default to "xenograft derived organoid" if present (based on original files) |
| 62 | + select(common_name, starts_with("PDX")) %>% |
| 63 | + left_join(mt_samps, by = "common_name") %>% |
| 64 | + select(improve_sample_id, common_name, model_type, |
| 65 | + RNASeq = PDX_RNASeq, |
| 66 | + Mutations = PDX_Somatic_Mutations, |
| 67 | + CopyNumber = PDX_CNV, |
| 68 | + Proteomics = PDX_Proteomics) %>% |
| 69 | + filter(!is.na(improve_sample_id)) |
| 70 | + |
| 71 | +# Combine all sample tables |
| 72 | +dcombined <- bind_rows(pdx_data, tumor_data, mt_data) %>% distinct() |
| 73 | +print("dcombined:") |
| 74 | +print(dcombined) |
| 75 | + |
| 76 | +# Helper to assign study label based on model_type |
| 77 | +study_label <- function(type) { |
| 78 | + case_when( |
| 79 | + type == "patient derived xenograft" ~ "MPNST PDX", |
| 80 | + type == "tumor" ~ "MPNST Tumor", |
| 81 | + type == "xenograft derived organoid" ~ "MPNST PDX MT", |
| 82 | + TRUE ~ "MPNST" |
| 83 | + ) |
| 84 | +} |
| 85 | + |
| 86 | +# Helper to pick metadata based on sample ID and column |
| 87 | +pick_meta <- function(id, column) { |
| 88 | + # columns are {"Proteomics","RNASeq","Mutations","CopyNumber"} |
| 89 | + if (any(tumor_data[[column]] == id, na.rm = TRUE)) { |
| 90 | + sdf <- tumor_data %>% filter(.data[[column]] == id) %>% slice(1) |
| 91 | + } else if (any(mt_data[[column]] == id, na.rm = TRUE)) { |
| 92 | + sdf <- mt_data %>% filter(.data[[column]] == id) %>% slice(1) |
| 93 | + } else if (any(pdx_data[[column]] == id, na.rm = TRUE)) { |
| 94 | + sdf <- pdx_data %>% filter(.data[[column]] == id) %>% slice(1) |
| 95 | + } else { |
| 96 | + return(NULL) |
| 97 | + } |
| 98 | + list( |
| 99 | + sample_id = sdf$improve_sample_id, |
| 100 | + model_type = sdf$model_type |
| 101 | + ) |
| 102 | +} |
| 103 | + |
| 104 | +# Safe extraction: only return non-empty data frames |
| 105 | +i_safe_extract <- function(df, sample_id, source_val, study_val) { |
| 106 | + if (is.null(df) || nrow(df) == 0) return(NULL) |
| 107 | + df$improve_sample_id <- sample_id |
| 108 | + df$source <- source_val |
| 109 | + df$study <- study_val |
| 110 | + df |
| 111 | +} |
| 112 | + |
| 113 | +# 1) Proteomics |
| 114 | +proteomics_list <- lapply( |
| 115 | + setdiff(dcombined$Proteomics, c("", NA, "NA")), |
| 116 | + function(id) { |
| 117 | + meta <- pick_meta(id, "Proteomics") |
| 118 | + if (is.null(meta)) return(NULL) |
| 119 | + |
| 120 | + df <- tryCatch( |
| 121 | + fread(synGet(id)$path) %>% |
| 122 | + rename(gene_symbol = Gene) %>% |
| 123 | + left_join(genes_df, by = "gene_symbol") %>% |
| 124 | + select(entrez_id, proteomics = logRatio) %>% |
| 125 | + filter(!is.na(entrez_id), proteomics != 0) %>% |
| 126 | + distinct(), |
| 127 | + error = function(e) NULL |
| 128 | + ) |
| 129 | + i_safe_extract( |
| 130 | + df, |
| 131 | + meta$sample_id, |
| 132 | + "NF Data Portal", |
| 133 | + study_label(meta$model_type) |
| 134 | + ) |
| 135 | + } |
| 136 | +) |
| 137 | +proteomics <- bind_rows(proteomics_list) |
| 138 | +fwrite(proteomics, file.path("/tmp", "mpnst_proteomics.csv")) |
| 139 | +message("Wrote combined proteomics") |
| 140 | + |
| 141 | + |
| 142 | +# 2) Transcriptomics (PDX, Tumor, and Organoid / MT which comes from PDX..) |
| 143 | +transcriptomics_list <- lapply( |
| 144 | + setdiff(dcombined$RNASeq, c("", NA, "NA")), |
| 145 | + function(id) { |
| 146 | + meta <- pick_meta(id, "RNASeq") |
| 147 | + if (is.null(meta)) return(NULL) |
| 148 | + |
| 149 | + df <- tryCatch({ |
| 150 | + fread(synGet(id)$path) %>% |
| 151 | + separate(Name, into = c("other_id","vers"), sep = "\\.") %>% |
| 152 | + select(-vers) %>% |
| 153 | + left_join(genes_df) %>% |
| 154 | + select(entrez_id, transcriptomics = TPM) %>% |
| 155 | + filter(!is.na(entrez_id), transcriptomics != 0) %>% |
| 156 | + distinct() |
| 157 | + }, error = function(e) NULL) |
| 158 | + |
| 159 | + i_safe_extract( |
| 160 | + df, |
| 161 | + meta$sample_id, |
| 162 | + "NF Data Portal", |
| 163 | + study_label(meta$model_type) |
| 164 | + ) |
| 165 | + } |
| 166 | +) |
| 167 | +transcriptomics <- bind_rows(transcriptomics_list) |
| 168 | +fwrite(transcriptomics, file.path("/tmp", "mpnst_transcriptomics.csv")) |
| 169 | +message("Wrote combined transcriptomics") |
| 170 | + |
| 171 | + |
| 172 | +# 3) Mutations (WES) |
| 173 | +wes_list <- lapply( |
| 174 | + setdiff(dcombined$Mutations, c("", NA, "NA")), |
| 175 | + function(id) { |
| 176 | + meta <- pick_meta(id, "Mutations") |
| 177 | + if (is.null(meta)) return(NULL) |
| 178 | + |
| 179 | + clean_id <- gsub('[\"\\[\\]]', '', id) |
| 180 | + df <- tryCatch( |
| 181 | + fread(synGet(clean_id)$path) %>% |
| 182 | + select(entrez_id = Entrez_Gene_Id, |
| 183 | + mutation = HGVSc, |
| 184 | + variant_classification = Variant_Classification) %>% |
| 185 | + filter(entrez_id %in% genes_df$entrez_id) %>% |
| 186 | + distinct(), |
| 187 | + error = function(e) NULL |
| 188 | + ) |
| 189 | + |
| 190 | + i_safe_extract( |
| 191 | + df, |
| 192 | + meta$sample_id, |
| 193 | + "NF Data Portal", |
| 194 | + study_label(meta$model_type) |
| 195 | + ) |
| 196 | + } |
| 197 | +) |
| 198 | +wes <- bind_rows(wes_list) |
| 199 | +fwrite(wes, file.path("/tmp", "mpnst_mutations.csv")) |
| 200 | +message("Wrote combined mutations") |
| 201 | + |
| 202 | + |
| 203 | +# 4) Copy Number Variation (CNV) |
| 204 | +cnv_list <- lapply( |
| 205 | + setdiff(dcombined$CopyNumber, c("", NA, "NA")), |
| 206 | + function(id) { |
| 207 | + meta <- pick_meta(id, "CopyNumber") |
| 208 | + if (is.null(meta)) return(NULL) |
| 209 | + |
| 210 | + clean_id <- gsub('[\"\\[\\]]', '', id) |
| 211 | + raw <- tryCatch(fread(synGet(clean_id)$path), error = function(e) NULL) |
| 212 | + if (is.null(raw)) return(NULL) |
| 213 | + |
| 214 | + df_long <- raw %>% |
| 215 | + separate_rows(gene, sep = ",") %>% |
| 216 | + rename(gene_symbol = gene) %>% |
| 217 | + left_join(genes_df, by = "gene_symbol") %>% |
| 218 | + filter(!is.na(entrez_id)) %>% |
| 219 | + select(entrez_id, log2) %>% |
| 220 | + distinct() %>% |
| 221 | + mutate(copy_number = 2^log2) %>% |
| 222 | + select(-log2) |
| 223 | + |
| 224 | + df <- df_long %>% |
| 225 | + mutate(copy_call = case_when( |
| 226 | + copy_number < 0.5210507 ~ "deep del", |
| 227 | + copy_number < 0.7311832 ~ "het loss", |
| 228 | + copy_number < 1.214125 ~ "diploid", |
| 229 | + copy_number < 1.422233 ~ "gain", |
| 230 | + TRUE ~ "amp" |
| 231 | + )) |
| 232 | + |
| 233 | + i_safe_extract( |
| 234 | + df, |
| 235 | + meta$sample_id, |
| 236 | + "NF Data Portal", |
| 237 | + study_label(meta$model_type) |
| 238 | + ) |
| 239 | + } |
| 240 | +) |
| 241 | +cnv <- bind_rows(cnv_list) |
| 242 | +fwrite(cnv, file.path("/tmp", "mpnst_copy_number.csv")) |
| 243 | +message("Wrote combined copy number") |
| 244 | + |
| 245 | + |
| 246 | +message("All combined data files created.") |
0 commit comments