Skip to content

Commit 481385e

Browse files
committed
Datasets Merged, Build works, Validate works
1 parent 004b730 commit 481385e

11 files changed

Lines changed: 637 additions & 530 deletions

build/mpnst/00_sample_gen.R

100755100644
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
# This script generate a new sample table based on pervious beatAML improved sample ID
2-
# It will take the maximum value of beatAML improved sample ID and continue from ID count from there
1+
# This script generates a new sample table based on the previous dataset's sample file (taking the max improve_sample_id)
32
# Load required libraries
43
library(data.table)
54
library(synapser)

build/mpnst/01_combined_omics.R

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
#!/usr/bin/env Rscript
2+
3+
# Combined MPNST & MPNST-PDX Data Extraction Script
4+
# This script unifies data extraction for PDX, Tumor, and Xenograft-Derived Organoid samples.
5+
6+
# Load required libraries
7+
library(data.table)
8+
library(synapser)
9+
library(dplyr)
10+
library(tidyr)
11+
12+
# Retrieve command line arguments
13+
# ---- Command-line arguments ----
# Positional, in this order: Synapse personal access token, sample mapping
# CSV, gene mapping CSV.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
  stop(
    "Usage: Rscript 01_combined_omics.R <PAT> <samples.csv> <genes.csv>",
    call. = FALSE
  )
}
PAT <- args[[1]]
samples <- args[[2]]
genes <- args[[3]]

# ---- Synapse login ----
token <- PAT
synLogin(authToken = token)

# ---- Sample and gene mappings ----
# Only the identifier, name and model type are needed from the sample file;
# duplicates are collapsed so each combination appears once.
samples_df <- distinct(
  select(fread(samples), improve_sample_id, common_name, model_type)
)
genes_df <- fread(genes)

# ---- Per-model-type sample subsets ----
pdx_samps <- samples_df %>%
  filter(model_type == "patient derived xenograft")
tumor_samps <- samples_df %>%
  filter(model_type == "tumor")
# Organoid samples are matched against the same manifest columns as the PDX
# samples further down.
mt_samps <- samples_df %>%
  filter(model_type == "xenograft derived organoid")

# ---- Manifest table from Synapse ----
# `Sample` is renamed so the manifest can be joined to the sample subsets.
manifest <- rename(
  synTableQuery("select * from syn53503360")$asDataFrame(),
  common_name = Sample
)

print("manifest")
print(manifest)
42+
43+
# Build sample tables
44+
# Join the manifest's PDX_* file columns onto one sample subset and keep only
# the manifest rows that resolve to an improve_sample_id.
pdx_manifest_table <- function(samps) {
  manifest %>%
    select(common_name, starts_with("PDX")) %>%
    left_join(samps, by = "common_name") %>%
    filter(!is.na(improve_sample_id)) %>%
    select(
      improve_sample_id, common_name, model_type,
      RNASeq = PDX_RNASeq,
      Mutations = PDX_Somatic_Mutations,
      CopyNumber = PDX_CNV,
      Proteomics = PDX_Proteomics
    )
}

# PDX samples.
pdx_data <- pdx_manifest_table(pdx_samps)

# Tumor samples: the manifest columns selected here include no proteomics
# file, so the Proteomics column is filled with an empty string.
tumor_data <- manifest %>%
  select(common_name, starts_with("Tumor")) %>%
  left_join(tumor_samps, by = "common_name") %>%
  filter(!is.na(improve_sample_id)) %>%
  select(
    improve_sample_id, common_name, model_type,
    RNASeq = Tumor_RNASeq,
    Mutations = Tumor_Somatic_Mutations,
    CopyNumber = Tumor_CNV
  ) %>%
  mutate(Proteomics = "")

# Organoid (MT) samples reuse the PDX_* manifest columns; only the sample
# subset joined on differs from pdx_data.
mt_data <- pdx_manifest_table(mt_samps)

# Combine all sample tables
dcombined <- distinct(bind_rows(pdx_data, tumor_data, mt_data))
print("dcombined:")
print(dcombined)
79+
80+
# Helper to assign study label based on model_type
81+
study_label <- function(type) {
  # Map each model_type string to its study label; anything unrecognised
  # (including NA) falls back to the generic "MPNST" label.
  known <- c(
    "patient derived xenograft" = "MPNST PDX",
    "tumor" = "MPNST Tumor",
    "xenograft derived organoid" = "MPNST PDX MT"
  )
  label <- unname(known[match(type, names(known))])
  label[is.na(label)] <- "MPNST"
  label
}
89+
90+
# Helper to pick metadata based on sample ID and column
91+
pick_meta <- function(id, column) {
  # `column` is one of "Proteomics", "RNASeq", "Mutations", "CopyNumber".
  # Tables are searched in precedence order (tumor, then organoid, then PDX);
  # the first row whose `column` value equals `id` supplies the metadata.
  # Returns NULL when no table contains `id`.
  for (tbl in list(tumor_data, mt_data, pdx_data)) {
    hits <- which(tbl[[column]] == id)
    if (length(hits) > 0) {
      first <- hits[[1]]
      return(list(
        sample_id = tbl$improve_sample_id[first],
        model_type = tbl$model_type[first]
      ))
    }
  }
  NULL
}
107+
108+
# Safe extraction: only return non-empty data frames
109+
i_safe_extract <- function(df, sample_id, source_val, study_val) {
  # Tag a non-empty data frame with its sample id, source and study.
  # A NULL or zero-row input yields NULL so callers can drop it when binding.
  has_rows <- !is.null(df) && nrow(df) > 0
  if (!has_rows) {
    return(NULL)
  }
  df[["improve_sample_id"]] <- sample_id
  df[["source"]] <- source_val
  df[["study"]] <- study_val
  df
}
116+
117+
# 1) Proteomics
118+
# For every distinct proteomics Synapse id in the combined sample table,
# download the file, map gene symbols to Entrez ids and tag the rows with the
# owning sample. Ids that cannot be matched to a sample are skipped.
proteomics_list <- lapply(
  setdiff(dcombined$Proteomics, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "Proteomics")
    if (is.null(meta)) return(NULL)

    df <- tryCatch(
      fread(synGet(id)$path) %>%
        rename(gene_symbol = Gene) %>%
        left_join(genes_df, by = "gene_symbol") %>%
        select(entrez_id, proteomics = logRatio) %>%
        filter(!is.na(entrez_id), proteomics != 0) %>%
        distinct(),
      error = function(e) {
        # Still best-effort, but report which file was dropped and why
        # instead of losing the sample silently.
        warning(
          "Skipping proteomics file ", id, ": ", conditionMessage(e),
          call. = FALSE, immediate. = TRUE
        )
        NULL
      }
    )
    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# NULL entries (skipped or empty files) are dropped by bind_rows().
proteomics <- bind_rows(proteomics_list)
fwrite(proteomics, file.path("/tmp", "mpnst_proteomics.csv"))
message("Wrote combined proteomics")
144+
145+
146+
# 2) Transcriptomics (PDX, Tumor, and Organoid/MT, which comes from the PDX files)
147+
# For every distinct RNA-seq Synapse id in the combined sample table,
# download the quantification file, strip the version suffix from the
# identifier in `Name`, map it to an Entrez id and tag the rows with the
# owning sample. Ids that cannot be matched to a sample are skipped.
transcriptomics_list <- lapply(
  setdiff(dcombined$RNASeq, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "RNASeq")
    if (is.null(meta)) return(NULL)

    df <- tryCatch({
      fread(synGet(id)$path) %>%
        separate(Name, into = c("other_id", "vers"), sep = "\\.") %>%
        select(-vers) %>%
        # Join explicitly on the identifier created above rather than on
        # whatever columns the two tables happen to share.
        # NOTE(review): assumes genes_df carries an `other_id` column; confirm
        # against the genes file.
        left_join(genes_df, by = "other_id") %>%
        select(entrez_id, transcriptomics = TPM) %>%
        filter(!is.na(entrez_id), transcriptomics != 0) %>%
        distinct()
    }, error = function(e) {
      # Still best-effort, but report which file was dropped and why
      # instead of losing the sample silently.
      warning(
        "Skipping transcriptomics file ", id, ": ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      NULL
    })

    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# NULL entries (skipped or empty files) are dropped by bind_rows().
transcriptomics <- bind_rows(transcriptomics_list)
fwrite(transcriptomics, file.path("/tmp", "mpnst_transcriptomics.csv"))
message("Wrote combined transcriptomics")
174+
175+
176+
# 3) Mutations (WES)
177+
# For every distinct mutation-file Synapse id in the combined sample table,
# download the file, keep the mutation columns for genes present in genes_df
# and tag the rows with the owning sample. Ids that cannot be matched to a
# sample are skipped.
wes_list <- lapply(
  setdiff(dcombined$Mutations, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "Mutations")
    if (is.null(meta)) return(NULL)

    # Strip any quotes / square brackets wrapped around the Synapse id.
    # perl = TRUE is required: with R's default regex engine a backslash is
    # literal inside a bracket expression, so the class would end at the
    # first `]` and the brackets would not be removed.
    clean_id <- gsub('[\"\\[\\]]', '', id, perl = TRUE)
    df <- tryCatch(
      fread(synGet(clean_id)$path) %>%
        select(entrez_id = Entrez_Gene_Id,
               mutation = HGVSc,
               variant_classification = Variant_Classification) %>%
        filter(entrez_id %in% genes_df$entrez_id) %>%
        distinct(),
      error = function(e) {
        # Still best-effort, but report which file was dropped and why
        # instead of losing the sample silently.
        warning(
          "Skipping mutation file ", clean_id, ": ", conditionMessage(e),
          call. = FALSE, immediate. = TRUE
        )
        NULL
      }
    )

    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# NULL entries (skipped or empty files) are dropped by bind_rows().
wes <- bind_rows(wes_list)
fwrite(wes, file.path("/tmp", "mpnst_mutations.csv"))
message("Wrote combined mutations")
205+
206+
207+
# 4) Copy Number Variation (CNV)
208+
# For every distinct copy-number Synapse id in the combined sample table,
# download the segment file, expand comma-separated gene lists to one row per
# gene, map symbols to Entrez ids, convert the log2 value to a linear
# copy_number and bin it into a categorical call. Ids that cannot be matched
# to a sample are skipped.
cnv_list <- lapply(
  setdiff(dcombined$CopyNumber, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "CopyNumber")
    if (is.null(meta)) return(NULL)

    # Strip any quotes / square brackets wrapped around the Synapse id.
    # perl = TRUE is required: with R's default regex engine a backslash is
    # literal inside a bracket expression, so the class would end at the
    # first `]` and the brackets would not be removed.
    clean_id <- gsub('[\"\\[\\]]', '', id, perl = TRUE)
    raw <- tryCatch(
      fread(synGet(clean_id)$path),
      error = function(e) {
        # Still best-effort, but report which file was dropped and why
        # instead of losing the sample silently.
        warning(
          "Skipping copy number file ", clean_id, ": ", conditionMessage(e),
          call. = FALSE, immediate. = TRUE
        )
        NULL
      }
    )
    if (is.null(raw)) return(NULL)

    df_long <- raw %>%
      separate_rows(gene, sep = ",") %>%
      rename(gene_symbol = gene) %>%
      left_join(genes_df, by = "gene_symbol") %>%
      filter(!is.na(entrez_id)) %>%
      select(entrez_id, log2) %>%
      distinct() %>%
      mutate(copy_number = 2^log2) %>%
      select(-log2)

    # Bin the linear value; case_when takes the first matching branch, so
    # each threshold is the exclusive upper bound of its category.
    df <- df_long %>%
      mutate(copy_call = case_when(
        copy_number < 0.5210507 ~ "deep del",
        copy_number < 0.7311832 ~ "het loss",
        copy_number < 1.214125 ~ "diploid",
        copy_number < 1.422233 ~ "gain",
        TRUE ~ "amp"
      ))

    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# NULL entries (skipped or empty files) are dropped by bind_rows().
cnv <- bind_rows(cnv_list)
fwrite(cnv, file.path("/tmp", "mpnst_copy_number.csv"))
message("Wrote combined copy number")

message("All combined data files created.")

0 commit comments

Comments
 (0)