Skip to content

Commit bd4d366

Browse files
committed
updates and fixes
2 parents 163000a + c4f0db8 commit bd4d366

19 files changed

Lines changed: 1222 additions & 43 deletions

build/build_dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def process_docker(dataset,validate):
4141
'hcmi': ['hcmi'],
4242
'beataml': ['beataml'],
4343
'mpnst': ['mpnst'],
44+
'mpnstpdx': ['mpnstpdx'],
4445
'cptac': ['cptac'],
4546
'genes': ['genes'],
4647
'upload': ['upload']
@@ -121,7 +122,8 @@ def process_omics(executor, dataset, should_continue):
121122
'mpnst': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
122123
'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
123124
'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
124-
'hcmi': ['mutations', 'transcriptomics']
125+
'hcmi': ['mutations', 'transcriptomics'],
126+
'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics']
125127
}
126128

127129
expected_omics = dataset_omics_files.get(dataset, [])

build/docker/Dockerfile.mpnstPDX

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
FROM r-base:4.3.2
2+
3+
# Set environment to noninteractive
4+
ENV DEBIAN_FRONTEND=noninteractive
5+
6+
# Update package list and install required packages
7+
RUN apt-get update && \
8+
apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
9+
zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev
10+
11+
# Download and compile Python 3.10 with shared library support
12+
RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
13+
tar -xf Python-3.10.12.tgz && \
14+
cd Python-3.10.12 && \
15+
./configure --enable-optimizations --enable-shared && \
16+
make -j$(nproc) && \
17+
make altinstall && \
18+
cd .. && \
19+
rm -rf Python-3.10.12.tgz Python-3.10.12
20+
21+
# Set Python 3.10 as default
22+
RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
23+
ln -s /usr/local/bin/pip3.10 /usr/bin/pip3
24+
25+
# Update library paths for Python shared library
26+
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
27+
28+
# Create a Python virtual environment
29+
RUN python3 -m venv /opt/venv
30+
RUN /opt/venv/bin/pip install --upgrade pip
31+
32+
# Set environment variables for reticulate
33+
ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
34+
# Make modules copied into /app importable from Python.
# Note: '#' only starts a comment at the beginning of a Dockerfile line; a
# trailing '#...' on an ENV line becomes part of the value, so keep this bare.
ENV PYTHONPATH=/app
35+
WORKDIR /app
36+
37+
# Set MPLCONFIGDIR to a writable directory
38+
ENV MPLCONFIGDIR=/app/tmp/matplotlib
39+
RUN mkdir -p /app/tmp/matplotlib
40+
41+
# Add necessary files to the container
42+
ADD build/mpnstpdx/requirements.txt .
43+
ADD build/mpnstpdx/requirements.r .
44+
ADD build/mpnstpdx/* ./
45+
ADD build/utils/* ./
46+
47+
# Install all Python libraries from requirements.txt
48+
RUN /opt/venv/bin/pip3 install -r requirements.txt
49+
50+
# Install all R libraries from requirements.r
51+
RUN Rscript requirements.r
52+
53+
# Set up volume for temporary storage
54+
VOLUME ["/tmp"]

build/docker/docker-compose.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,15 @@ services:
4444
HTTPS_PROXY: ${HTTPS_PROXY}
4545
platform: linux/amd64
4646
image: mpnst:latest
47-
47+
48+
mpnstpdx:
49+
build:
50+
context: ../../
51+
# Path is case-sensitive on Linux; it must match the file name Dockerfile.mpnstPDX.
dockerfile: build/docker/Dockerfile.mpnstPDX
52+
args:
53+
HTTPS_PROXY: ${HTTPS_PROXY}
54+
platform: linux/amd64
55+
image: mpnstpdx:latest
4856
cptac:
4957
build:
5058
context: ../../

build/mpnst/00_sample_gen.R

Lines changed: 13 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,14 @@ library(dplyr)
77

88
##adding a command line argument
99
args = commandArgs(trailingOnly=TRUE)
10-
if(length(args) > 1 ){
11-
stop("Up to one argument is allowed. This is the filepath to the previously run samples file.")
12-
}
13-
10+
if(length(args)!=2){
11+
stop("Need a sample file and synapse token as argument. Rscript 00_sample_gen.R [samplefile] [synapse token]")
1412

15-
if (length(args) == 0 || is.na(args[1]) || args[1] == "" || !file.exists(args[1])) {
16-
orig_samples <- ""
17-
} else {
18-
orig_samples <- fread(args[1])
1913
}
2014

15+
orig_samples<-fread(args[1])
2116

22-
# Check if Synapse token is available from the environment
23-
synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN")
24-
if (synapse_token == "") {
25-
stop("Error: SYNAPSE_AUTH_TOKEN environment variable is not set.")
26-
}
27-
28-
synapser::synLogin(authToken=synapse_token)
17+
synapser::synLogin(authToken=args[2])
2918
manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
3019
as.data.frame()
3120

@@ -43,18 +32,23 @@ manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
4332
##first create samples for the original tumors
4433
tumorTable<-manifest|>
4534
dplyr::select(common_name='Sample')|>
46-
dplyr::mutate(other_id_source='NF Data Portal',other_names='',cancer_type="Malignant peripheral nerve sheath tumor",species='Homo sapiens (Human)',model_type='tumor')|>
35+
dplyr::mutate(other_id_source='NF Data Portal',other_names='',cancer_type="Malignant peripheral nerve sheath tumor",species='Human',model_type='tumor')|>
4736
tidyr::unite(col='other_id',c('common_name','model_type'),sep=' ',remove=FALSE)
4837

4938
##then create samples for the PDX
5039
sampTable<-manifest|>
5140
dplyr::select(common_name='Sample',MicroTissueDrugFolder)|>
52-
dplyr::mutate(other_id_source='NF Data Portal',other_names='',cancer_type="Malignant peripheral nerve sheath tumor",species='Homo sapiens (Human)',model_type='patient derived xenograft')|>
41+
dplyr::mutate(other_id_source='NF Data Portal',other_names='',cancer_type="Malignant peripheral nerve sheath tumor",species='Human',model_type='patient derived xenograft')|>
5342
tidyr::unite(col='other_id',c('common_name','model_type'),sep=' ',remove=FALSE)
5443

5544

45+
pdxmt<-manifest|>
46+
dplyr::select(common_name='Sample',MicroTissueDrugFolder)|>
47+
dplyr::mutate(other_id_source='NF Data Portal',other_names='',cancer_type="Malignant peripheral nerve sheath tumor",species='Human',model_type='organoid')|>
48+
tidyr::unite(col='other_id',c('common_name','model_type'),sep=' ',remove=FALSE)
49+
5650
##third, generate an organoid (MT) sample for every manifest row; the MicroTissueDrugFolder filter below is currently commented out
57-
pdxmt<-subset(sampTable,!is.na(MicroTissueDrugFolder))
51+
#pdxmt<-subset(sampTable,!is.na(MicroTissueDrugFolder))
5852
pdxmt$model_type=rep('organoid',nrow(pdxmt))
5953
print(pdxmt)
6054

@@ -64,15 +58,7 @@ main<-rbind(sampTable,pdxmt)|>
6458

6559
#main <- fread("mpnst/NF_MPNST_samples.csv")
6660
#previous_aml <- fread(args[1])#"beatAML/beataml_samples.csv")
67-
68-
# If there is no previous samples file - start at 1, else, continue where the previous one left off.
69-
if (identical(orig_samples, "")) {
70-
max_id <- 1
71-
} else {
72-
max_id <- max(orig_samples$improve_sample_id, na.rm = TRUE)
73-
}
74-
75-
61+
# Highest improve_sample_id already assigned in the previous samples file.
# na.rm = TRUE keeps a single missing id from turning max_id (and therefore
# every newly generated id below) into NA.
max_id <- max(orig_samples$improve_sample_id, na.rm = TRUE)
7662
main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main))
7763

7864
#synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv")

build/mpnst/01_mpnst_get_omics.R

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ samples_df <- fread(patients)|>
3434

3535
pdx_samps<-subset(samples_df,model_type=='patient derived xenograft')
3636
tumor_samps<-subset(samples_df,model_type=='tumor')
37+
mt_samps<-subset(samples_df,model_type=='organoid')
3738

3839
##now get the manifest from synapse
3940
manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
@@ -45,14 +46,19 @@ manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
4546
##they each get their own sample identifier
4647
pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|>
4748
left_join(pdx_samps)|>
48-
dplyr::select(improve_sample_id,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')
49+
dplyr::select(improve_sample_id,common_name,model_type,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')
4950

5051
tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|>
5152
left_join(tumor_samps)|>
52-
dplyr::select(improve_sample_id,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|>
53+
dplyr::select(improve_sample_id,common_name,model_type,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|>
5354
mutate(Proteomics='') ##we dont have tumor proteomics from these samples
5455
#print(tumor_data)
5556

57+
mt_data<- manifest|>dplyr::select(common_name,starts_with("PDX"))|>
58+
left_join(mt_samps)|>
59+
dplyr::select(improve_sample_id,common_name,model_type, RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')##we dont have mt data yet, so collecting PDX instead
60+
#print(tumor_data)
61+
5662

5763
combined<-rbind(pdx_data,tumor_data)|>distinct()
5864

@@ -61,10 +67,10 @@ genes_df <- fread(genefile)
6167

6268

6369
##added proteomics first
64-
proteomics<-do.call('rbind',lapply(setdiff(combined$Proteomics,c('',NA,"NA")),function(x){
70+
proteomics<-do.call('rbind',lapply(setdiff(mt_data$Proteomics,c('',NA,"NA")),function(x){
6571
# if(x!=""){
6672
#print(x)
67-
sample<-subset(combined,Proteomics==x)
73+
sample<-subset(mt_data,Proteomics==x)
6874
#print(sample)
6975
res<-fread(synGet(x)$path)|>
7076
#tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|>
@@ -88,10 +94,10 @@ fwrite(proteomics,'/tmp/mpnst_proteomics.csv.gz')
8894

8995
#### NEXT WE GET RNASeq Data
9096

91-
rnaseq<-do.call('rbind',lapply(setdiff(combined$RNASeq,c(NA,"NA")),function(x){
97+
rnaseq<-do.call('rbind',lapply(setdiff(mt_data$RNASeq,c(NA,"NA")),function(x){
9298
# if(x!=""){
9399
#print(x)
94-
sample<-subset(combined,RNASeq==x)
100+
sample<-subset(mt_data,RNASeq==x)
95101
#print(sample)
96102
res<-fread(synGet(x)$path)|>
97103
tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|>
@@ -114,11 +120,11 @@ fwrite(rnaseq,'/tmp/mpnst_transcriptomics.csv.gz')
114120

115121
#####NEXT WE DO WES DATA
116122
print("Getting WES")
117-
wes<-do.call(rbind,lapply(setdiff(combined$`Mutations`,c(NA,"NA")),function(x){
123+
wes<-do.call(rbind,lapply(setdiff(mt_data$`Mutations`,c(NA,"NA")),function(x){
118124

119125
x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T)
120126
print(x)
121-
sample<-subset(combined,Mutations==x)
127+
sample<-subset(mt_data,Mutations==x)
122128
print(sample$improve_sample_id)
123129
res<-NULL
124130
try(res<-fread(synGet(x2)$path)|>
@@ -141,11 +147,11 @@ fwrite(wes,'/tmp/mpnst_mutations.csv.gz')
141147

142148
print(paste("getting CNV"))
143149
##next let's do CNVs!
144-
cnv<-do.call(rbind,lapply(setdiff(combined$CopyNumber,c(NA,"NA")),function(x){
150+
cnv<-do.call(rbind,lapply(setdiff(mt_data$CopyNumber,c(NA,"NA")),function(x){
145151

146152
x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T)
147153
print(x)
148-
sample<-subset(combined,CopyNumber==x)
154+
sample<-subset(mt_data,CopyNumber==x)
149155
print(sample$improve_sample_id)
150156
res<-fread(synGet(x2)$path)
151157

0 commit comments

Comments
 (0)