Skip to content

Commit fbab783

Browse files
authored
Merge pull request #425 from PNNL-CompBio/mpnst_dataset_join
Combine mpnst and mpnstpdx Datasets
2 parents 618b21b + 8c2f4c0 commit fbab783

25 files changed

Lines changed: 683 additions & 1170 deletions

build/build_all.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def main():
4040
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
4141
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
4242
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
43-
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.')
43+
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,mpnst',help='Datasets to process. Defaults to all available.')
4444
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
4545
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
4646
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -119,7 +119,6 @@ def process_docker(datasets):
119119
'hcmi': ['hcmi'],
120120
'beataml': ['beataml'],
121121
'mpnst': ['mpnst'],
122-
'mpnstpdx': ['mpnstpdx'],
123122
'pancpdo': ['pancpdo'],
124123
'bladderpdo': ['bladderpdo'],
125124
'sarcpdo': ['sarcpdo'],
@@ -410,7 +409,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
410409
# if args.figshare or args.validate:
411410
# FigShare File Prefixes:
412411

413-
prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo']
412+
prefixes = ['beataml', 'hcmi', 'cptac', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo','mpnst']
414413
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
415414
if "broad_sanger" in datasets:
416415
prefixes.extend(broad_sanger_datasets)

build/build_dataset.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ def process_docker(dataset,validate):
4141
'hcmi': ['hcmi'],
4242
'beataml': ['beataml'],
4343
'mpnst': ['mpnst'],
44-
'mpnstpdx': ['mpnstpdx'],
4544
'pancpdo': ['pancpdo'],
4645
'cptac': ['cptac'],
4746
'sarcpdo': ['sarcpdo'],
@@ -128,7 +127,6 @@ def process_omics(executor, dataset, should_continue):
128127
'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
129128
'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
130129
'hcmi': ['mutations', 'transcriptomics'],
131-
'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
132130
'sarcpdo': ['mutations', 'transcriptomics'],
133131
'pancpdo': ['transcriptomics'],
134132
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],

build/mpnst/00_sample_gen.R

100755100644
Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
# This script generate a new sample table based on pervious beatAML improved sample ID
2-
# It will take the maximum value of beatAML improved sample ID and continue from ID count from there
1+
# This script generates a new sample table that continues from the previous dataset's samples file (starting after its maximum improve_sample_id)
32
# Load required libraries
43
library(data.table)
54
library(synapser)
@@ -11,14 +10,12 @@ if(length(args) > 1 ){
1110
stop("Up to one argument is allowed. This is the filepath to the previously run samples file.")
1211
}
1312

14-
1513
if (length(args) == 0 || is.na(args[1]) || args[1] == "" || !file.exists(args[1])) {
1614
orig_samples <- ""
1715
} else {
1816
orig_samples <- fread(args[1])
1917
}
2018

21-
2219
# Check if Synapse token is available from the environment
2320
synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN")
2421
if (synapse_token == "") {
@@ -29,6 +26,10 @@ synapser::synLogin(authToken=synapse_token)
2926
manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
3027
as.data.frame()
3128

29+
#Drop contaminated sample JH-2-009
30+
# Remove the contaminated sample from the manifest.
# Uses the native pipe and an explicit dplyr:: prefix, matching the other
# statements in this script, so the call does not depend on dplyr/magrittr
# being attached and cannot resolve to stats::filter.
manifest <- manifest |>
  dplyr::filter(Sample != "JH-2-009")
32+
3233

3334
###sample file has a strict schema
3435
## - improve_sample_id
@@ -62,31 +63,13 @@ main<-rbind(sampTable,pdxmt)|>
6263
dplyr::select(-MicroTissueDrugFolder)|>
6364
rbind(tumorTable)
6465

65-
#main <- fread("mpnst/NF_MPNST_samples.csv")
66-
#previous_aml <- fread(args[1])#"beatAML/beataml_samples.csv")
67-
6866
# If there is no previous samples file - start at 1, else, continue where the previous one left off.
6967
if (identical(orig_samples, "")) {
7068
max_id <- 1
7169
} else {
7270
max_id <- max(orig_samples$improve_sample_id, na.rm = TRUE)
7371
}
7472

75-
7673
main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main))
7774

78-
#synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv")
79-
# Step 1: Create a dictionary from 'main'
80-
#id_dict <- setNames(main$improve_sample_id, main$other_id)
81-
82-
# Step 2: Update 'ID' in 'synapse_main'
83-
#synapse_main$ID <- id_dict[synapse_main$Sample]
84-
85-
# Handling NA values if any mismatch occurs (Optional based on your data integrity)
86-
# If there are NAs generated, you might need to check for unmatched keys
87-
# synapse_main$ID[is.na(synapse_main$ID)] <- -1 # Assign a placeholder like -1 for unmatched rows
88-
89-
# Step 3: Save the updated 'synapse_main'
90-
#fwrite(synapse_main, "mpnst/synapse_NF-MPNST_samples.csv")
91-
#fwrite(main, "mpnst/NF_MPNST_samples.csv") # updated sample file
9275
fwrite(main,'/tmp/mpnst_samples.csv')

build/mpnst/01_combined_omics.R

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
#!/usr/bin/env Rscript
2+
3+
# Combined MPNST & MPNST-PDX Data Extraction Script
4+
# This script unifies data extraction for PDX, Tumor, and Xenograft-Derived Organoid samples.
5+
6+
# Load required libraries
7+
library(data.table)
8+
library(synapser)
9+
library(dplyr)
10+
library(tidyr)
11+
12+
# Retrieve command line arguments
13+
# Positional arguments: <PAT> <samples.csv> <genes.csv>
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
  stop("Usage: Rscript 01_combined_omics.R <PAT> <samples.csv> <genes.csv>", call. = FALSE)
}
PAT <- args[[1]]
samples <- args[[2]]
genes <- args[[3]]

# Authenticate with Synapse using the supplied token
token <- PAT
synLogin(authToken = token)

# Gene mapping table, and the de-duplicated sample mapping restricted to the
# three columns used below
genes_df <- fread(genes)
samples_df <- distinct(
  select(fread(samples), improve_sample_id, common_name, model_type)
)

# One sample lookup per model type
pdx_samps <- samples_df %>%
  filter(model_type == "patient derived xenograft")
tumor_samps <- samples_df %>%
  filter(model_type == "tumor")
# These end up being the same as pdx_samps in the manifest.
mt_samps <- samples_df %>%
  filter(model_type == "xenograft derived organoid")

# Manifest table from Synapse; `Sample` is renamed so it can be joined to the
# sample lookups on `common_name`
manifest <- rename(
  synTableQuery("select * from syn53503360")$asDataFrame(),
  common_name = Sample
)
39+
40+
# Build sample tables
41+
# Build the table of Synapse file IDs for one model type.
#   samps          - sample lookup (improve_sample_id, common_name, model_type)
#   prefix         - manifest column prefix ("PDX" or "Tumor")
#   has_proteomics - TRUE to take <prefix>_Proteomics from the manifest;
#                    FALSE to fill the Proteomics column with ""
# Manifest rows with no matching sample (NA improve_sample_id) are dropped.
build_assay_table <- function(samps, prefix, has_proteomics = TRUE) {
  joined <- manifest %>%
    select(common_name, starts_with(prefix)) %>%
    left_join(samps, by = "common_name")

  if (has_proteomics) {
    tbl <- joined %>%
      select(improve_sample_id, common_name, model_type,
             RNASeq = all_of(paste0(prefix, "_RNASeq")),
             Mutations = all_of(paste0(prefix, "_Somatic_Mutations")),
             CopyNumber = all_of(paste0(prefix, "_CNV")),
             Proteomics = all_of(paste0(prefix, "_Proteomics")))
  } else {
    tbl <- joined %>%
      select(improve_sample_id, common_name, model_type,
             RNASeq = all_of(paste0(prefix, "_RNASeq")),
             Mutations = all_of(paste0(prefix, "_Somatic_Mutations")),
             CopyNumber = all_of(paste0(prefix, "_CNV"))) %>%
      mutate(Proteomics = "")
  }

  filter(tbl, !is.na(improve_sample_id))
}

pdx_data <- build_assay_table(pdx_samps, "PDX")
tumor_data <- build_assay_table(tumor_samps, "Tumor", has_proteomics = FALSE)
# Note, this is the same as pdx_data but I think we default to
# "xenograft derived organoid" if present (based on original files)
mt_data <- build_assay_table(mt_samps, "PDX")

# Combine all sample tables
dcombined <- distinct(bind_rows(pdx_data, tumor_data, mt_data))
print("dcombined:")
print(dcombined)
75+
76+
# Helper to assign study label based on model_type
77+
study_label <- function(type) {
  # Map each model type to its study label; anything unrecognised
  # (including NA) falls back to "MPNST".
  known <- c(
    "patient derived xenograft" = "MPNST PDX",
    "tumor" = "MPNST Tumor",
    "xenograft derived organoid" = "MPNST PDX MT"
  )
  labels <- unname(known[match(type, names(known))])
  labels[is.na(labels)] <- "MPNST"
  labels
}
85+
86+
# Helper to pick metadata based on sample ID and column
87+
pick_meta <- function(id, column) {
  # `column` is one of "Proteomics", "RNASeq", "Mutations", "CopyNumber".
  # Tables are searched in priority order (tumor, then organoid/MT, then PDX);
  # the first row whose `column` equals `id` supplies the metadata.
  # Returns list(sample_id, model_type), or NULL when no table contains `id`.
  for (tbl in list(tumor_data, mt_data, pdx_data)) {
    hits <- which(tbl[[column]] == id)  # which() ignores NA comparisons
    if (length(hits) > 0) {
      first <- hits[1]
      return(list(
        sample_id = tbl$improve_sample_id[first],
        model_type = tbl$model_type[first]
      ))
    }
  }
  NULL
}
103+
104+
# Safe extraction: only return non-empty data frames
105+
i_safe_extract <- function(df, sample_id, source_val, study_val) {
  # Nothing to annotate: a failed read (NULL) or an empty table yields NULL so
  # it is skipped when results are row-bound.
  if (is.null(df) || nrow(df) == 0) {
    return(NULL)
  }

  # Stamp every row with the sample ID, data source and study label
  annotated <- df
  annotated$improve_sample_id <- sample_id
  annotated$source <- source_val
  annotated$study <- study_val
  annotated
}
112+
113+
# 1) Proteomics
114+
# For each distinct proteomics Synapse ID in `dcombined`: look up the owning
# sample, download the file, map gene symbols to Entrez IDs, drop unmapped and
# zero-valued rows, and tag the result with sample/source/study.
proteomics_list <- lapply(
  setdiff(dcombined$Proteomics, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "Proteomics")
    if (is.null(meta)) return(NULL)

    df <- tryCatch(
      fread(synGet(id)$path) %>%
        rename(gene_symbol = Gene) %>%
        left_join(genes_df, by = "gene_symbol") %>%
        select(entrez_id, proteomics = logRatio) %>%
        filter(!is.na(entrez_id), proteomics != 0) %>%
        distinct(),
      error = function(e) {
        # Still best-effort (the file is skipped), but report which file
        # failed and why instead of silently losing its data.
        message("Skipping proteomics file ", id, ": ", conditionMessage(e))
        NULL
      }
    )

    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# bind_rows() ignores the NULL entries left by skipped files
proteomics <- bind_rows(proteomics_list)
fwrite(proteomics, file.path("/tmp", "mpnst_proteomics.csv"))
message("Wrote combined proteomics")
140+
141+
142+
# 2) Transcriptomics (PDX, Tumor, and Organoid / MT which comes from PDX..)
143+
# For each distinct RNA-seq Synapse ID in `dcombined`: look up the owning
# sample, download the file, strip the version suffix from `Name`, map to
# Entrez IDs, drop unmapped and zero-TPM rows, and tag the result.
transcriptomics_list <- lapply(
  setdiff(dcombined$RNASeq, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "RNASeq")
    if (is.null(meta)) return(NULL)

    df <- tryCatch({
      fread(synGet(id)$path) %>%
        # "<id>.<version>" -> keep only the part before the dot
        separate(Name, into = c("other_id", "vers"), sep = "\\.") %>%
        select(-vers) %>%
        # NOTE(review): no `by` given, so this joins on every column shared
        # with genes_df (presumably just other_id — confirm and make explicit).
        left_join(genes_df) %>%
        select(entrez_id, transcriptomics = TPM) %>%
        filter(!is.na(entrez_id), transcriptomics != 0) %>%
        distinct()
    }, error = function(e) {
      # Still best-effort (the file is skipped), but report which file
      # failed and why instead of silently losing its data.
      message("Skipping transcriptomics file ", id, ": ", conditionMessage(e))
      NULL
    })

    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# bind_rows() ignores the NULL entries left by skipped files
transcriptomics <- bind_rows(transcriptomics_list)
fwrite(transcriptomics, file.path("/tmp", "mpnst_transcriptomics.csv"))
message("Wrote combined transcriptomics")
170+
171+
172+
# 3) Mutations (WES)
173+
# For each distinct mutation (WES) Synapse ID in `dcombined`: look up the
# owning sample, download the file, keep the Entrez ID / HGVSc /
# variant-classification columns for genes present in genes_df, and tag it.
wes_list <- lapply(
  setdiff(dcombined$Mutations, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "Mutations")
    if (is.null(meta)) return(NULL)

    # Strip list-style wrapping (e.g. ["syn123"] -> syn123). In R's default
    # regex engine a backslash is literal inside a bracket expression, so the
    # closing bracket must be listed first: the class below is { ] [ " }.
    clean_id <- gsub("[][\"]", "", id)
    df <- tryCatch(
      fread(synGet(clean_id)$path) %>%
        select(entrez_id = Entrez_Gene_Id,
               mutation = HGVSc,
               variant_classification = Variant_Classification) %>%
        filter(entrez_id %in% genes_df$entrez_id) %>%
        distinct(),
      error = function(e) {
        # Still best-effort (the file is skipped), but report which file
        # failed and why instead of silently losing its data.
        message("Skipping mutations file ", clean_id, ": ", conditionMessage(e))
        NULL
      }
    )

    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# bind_rows() ignores the NULL entries left by skipped files
wes <- bind_rows(wes_list)
fwrite(wes, file.path("/tmp", "mpnst_mutations.csv"))
message("Wrote combined mutations")
201+
202+
203+
# 4) Copy Number Variation (CNV)
204+
# For each distinct copy-number Synapse ID in `dcombined`: look up the owning
# sample, download the segment file, expand comma-separated gene lists to one
# row per gene, map to Entrez IDs, convert log2 values to copy_number, and
# assign a categorical copy_call.
cnv_list <- lapply(
  setdiff(dcombined$CopyNumber, c("", NA, "NA")),
  function(id) {
    meta <- pick_meta(id, "CopyNumber")
    if (is.null(meta)) return(NULL)

    # Strip list-style wrapping (e.g. ["syn123"] -> syn123). In R's default
    # regex engine a backslash is literal inside a bracket expression, so the
    # closing bracket must be listed first: the class below is { ] [ " }.
    clean_id <- gsub("[][\"]", "", id)
    raw <- tryCatch(
      fread(synGet(clean_id)$path),
      error = function(e) {
        # Still best-effort (the file is skipped), but report which file
        # failed and why instead of silently losing its data.
        message("Skipping copy number file ", clean_id, ": ", conditionMessage(e))
        NULL
      }
    )
    if (is.null(raw)) return(NULL)

    df_long <- raw %>%
      separate_rows(gene, sep = ",") %>%
      rename(gene_symbol = gene) %>%
      left_join(genes_df, by = "gene_symbol") %>%
      filter(!is.na(entrez_id)) %>%
      select(entrez_id, log2) %>%
      distinct() %>%
      mutate(copy_number = 2^log2) %>%
      select(-log2)

    # Bin copy_number into call categories; cut-points are applied in
    # ascending order, so each row gets the first bin it falls below.
    df <- df_long %>%
      mutate(copy_call = case_when(
        copy_number < 0.5210507 ~ "deep del",
        copy_number < 0.7311832 ~ "het loss",
        copy_number < 1.214125 ~ "diploid",
        copy_number < 1.422233 ~ "gain",
        TRUE ~ "amp"
      ))

    i_safe_extract(
      df,
      meta$sample_id,
      "NF Data Portal",
      study_label(meta$model_type)
    )
  }
)
# bind_rows() ignores the NULL entries left by skipped files
cnv <- bind_rows(cnv_list)
fwrite(cnv, file.path("/tmp", "mpnst_copy_number.csv"))
message("Wrote combined copy number")
244+
245+
246+
message("All combined data files created.")

0 commit comments

Comments
 (0)