
Commit 2a4e860

Merge branch 'main' into bladder_pdo
2 parents: e24b429 + 0bfce9a

35 files changed: 535242 additions & 386 deletions

LICENSE renamed to LICENSE_DISCLAIMER
Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-Copyright Battelle Memorial Institute
+Copyright Battelle Memorial Institute 2025
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
```

README.md
Lines changed: 2 additions & 0 deletions

```diff
@@ -1,5 +1,7 @@
 ## Cancer Omics Drug Experiment Response Dataset
 
+
+
 There is a recent explosion of deep learning algorithms that tackle the computational problem of predicting drug treatment outcome from baseline molecular measurements. To support this, we have built a benchmark dataset that harmonizes diverse datasets to better assess algorithm performance.
 
 This package collects diverse sets of paired molecular datasets with corresponding drug sensitivity data. All data here is reprocessed and standardized so it can be easily used as a benchmark dataset for the
```

build/beatAML/GetBeatAML.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -653,7 +653,7 @@ def generate_drug_list(drug_map_path,drug_path):
     # New Transcriptomics Data
     print("Starting Transcriptomics Data")
     ##first run conversion tool
-    os.system("python tpmFromCounts.py --counts "+transcriptomics_file)
+    os.system("python tpmFromCounts.py --counts {} --out_file {}".format(transcriptomics_file,'tpm_'+transcriptomics_file))
 
 
     t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')
```
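The new call threads an explicit `--out_file` through to `tpmFromCounts.py`, so the name that the downstream `pd.read_csv` expects (`'tpm_' + transcriptomics_file`) is set in one place rather than relying on the helper's default. A minimal sketch of the same step using `subprocess.run`, which, unlike `os.system`, raises when the conversion script fails; the flag names are assumed from the diff above:

```python
import subprocess

def run_tpm_conversion(transcriptomics_file: str) -> str:
    """Convert a raw counts file to TPM; returns the output filename."""
    out_file = "tpm_" + transcriptomics_file
    subprocess.run(
        ["python", "tpmFromCounts.py",
         "--counts", transcriptomics_file,
         "--out_file", out_file],
        check=True,  # raise CalledProcessError on a non-zero exit
    )
    return out_file
```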

build/broad_sanger/02-broadSangerOmics.R
Lines changed: 5 additions & 1 deletion

```diff
@@ -566,12 +566,16 @@ main<-function(){
   lapply(alltypes,function(dt){
     print(dt)
     temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
+    readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
     tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
-    readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
+    readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
+
+    # readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
     rm(tempd)
     rm(temps)
   })
 
 }
 
 main()
+
```
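The R change stops `rbind`-ing the DepMap and Sanger frames into one combined `broad_sanger_*` file and instead writes one file per source. For readers following the pipeline in Python, a hedged pandas sketch of the same split-write pattern (the frame names are illustrative, not the script's):

```python
import pandas as pd

def write_split(broad_df: pd.DataFrame, sanger_df: pd.DataFrame, dt: str) -> None:
    # pandas, like readr::write_csv, infers gzip compression from the
    # .csv.gz suffix, so each source lands in its own compressed file.
    broad_df.dropna().to_csv(f"/tmp/broad_{dt}.csv.gz", index=False)
    sanger_df.dropna().to_csv(f"/tmp/sanger_{dt}.csv.gz", index=False)
```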

build/broad_sanger/02a-broad_sanger_proteomics.py
Lines changed: 7 additions & 4 deletions

```diff
@@ -48,6 +48,9 @@ def main():
 
     full[['study']] = 'DepMap'
     full[['source']] = 'Broad'
+    ##now save to separate files
+    full.dropna(axis=0)
+    full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
 
 
     ##now get sanger
@@ -69,9 +72,9 @@ def main():
     full2.loc[:,['study']] = 'Sanger'
     full2.loc[:,['source']] = 'Sanger'
 
-    full3 = pd.concat([full,full2])
-    print(full3)
-    full3.dropna(axis=0)
-    full3.to_csv('/tmp/broad_sanger_proteomics.csv.gz',index=False, compression='gzip')
+    #full3 = pd.concat([full,full2])
+    #print(full3)
+    full2.dropna(axis=0)
+    full2.to_csv('/tmp/sanger_proteomics.csv.gz',index=False, compression='gzip')
 
 main()
```
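One wrinkle carried over from the old code: `full.dropna(axis=0)` and `full2.dropna(axis=0)` discard their results, because pandas `DataFrame.dropna` returns a new frame unless the result is assigned (or `inplace=True` is passed), so no NA rows are actually dropped before the writes. A self-contained demonstration:

```python
import pandas as pd

df = pd.DataFrame({"a": [1.0, None], "b": [2.0, 3.0]})
df.dropna(axis=0)       # returns a filtered copy; df itself is unchanged
assert len(df) == 2
df = df.dropna(axis=0)  # assigning the result is what drops the NA row
assert len(df) == 1
```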

build/broad_sanger/05b_separate_datasets.py
Lines changed: 17 additions & 10 deletions

```diff
@@ -5,6 +5,12 @@
 import shutil
 
 def main():
+
+    print("ls:\n")
+    files = os.listdir(".")
+    print(files)
+    print("\n")
+
     datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
     omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv
     samples_datatypes = ["samples"] #csv
@@ -13,14 +19,14 @@ def main():
 
 
     dataset_sources = {
-        "CCLE": ["Broad"],
-        "CTRPv2": ["Broad"],
-        "PRISM": ["Broad"],
-        "GDSCv1": ["Sanger"],
-        "GDSCv2": ["Sanger"],
-        "FIMM": ["Broad"],
-        "gCSI": ["Broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
-        "NCI60": ["Broad"]
+        "CCLE": ["broad"],
+        "CTRPv2": ["broad"],
+        "PRISM": ["broad"],
+        "GDSCv1": ["sanger"],
+        "GDSCv2": ["sanger"],
+        "FIMM": ["broad"],
+        "gCSI": ["broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
+        "NCI60": ["broad"]
     }
 
     for dataset in datasets_to_process:
@@ -70,14 +76,15 @@ def main():
 
         #One by one, filter other Omics files, write to file, delete from mem.
         for omics in omics_datatypes:
-            omics_filename_in = f"broad_sanger_{omics}.csv"
+            ds = dataset_sources[dataset][0]
+            omics_filename_in = f"{ds}_{omics}.csv"
             if os.path.isfile(omics_filename_in + ".gz"):
                 omics_filename_in += ".gz"
 
             omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
             omics_df = pl.read_csv(omics_filename_in)
             omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
-            omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
+            # omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
             omics_df.write_csv(omics_filename_out) #csv
 
             #Rewrite as gzipped if needed
```
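With the `source` filter commented out, the per-source separation now happens at file-selection time: each dataset reads the `broad_*` or `sanger_*` table written upstream instead of filtering a combined `broad_sanger_*` table by its `source` column. A hedged sketch of the new resolution logic, abbreviated from the diff:

```python
import os

dataset_sources = {"CCLE": ["broad"], "GDSCv1": ["sanger"]}  # abbreviated

def resolve_omics_file(dataset: str, omics: str) -> str:
    ds = dataset_sources[dataset][0]
    filename = f"{ds}_{omics}.csv"
    if os.path.isfile(filename + ".gz"):  # prefer the gzipped copy if present
        filename += ".gz"
    return filename

print(resolve_omics_file("GDSCv1", "transcriptomics"))  # sanger_transcriptomics.csv[.gz]
```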

build/broad_sanger/build_misc.sh
Lines changed: 2 additions & 3 deletions

```diff
@@ -3,13 +3,12 @@ set -euo pipefail
 
 trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
 
-cp /tmp/broad_sanger* .
+cp /tmp/broad* .
+cp /tmp/sanger* .
 
 echo "Running 05a_remove_problem_drugs.py..."
 /opt/venv/bin/python 05a_remove_problem_drugs.py
 
 echo "Running 05b_separate_datasets.py..."
 /opt/venv/bin/python 05b_separate_datasets.py
 
-echo "Removing broad_sanger* files..."
-rm broad_sanger*
```

build/build_all.py
Lines changed: 8 additions & 6 deletions

```diff
@@ -40,7 +40,7 @@ def main():
     parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
     parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
     parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
-    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo',help='Datasets to process. Defaults to all available.')
+    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',help='Datasets to process. Defaults to all available.')
     parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -62,7 +62,7 @@ def run_docker_cmd(cmd_arr,filename):
     print('running...'+filename)
     env = os.environ.copy()
    if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
-        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST and beatAML Datasets')
+        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets')
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
     else:
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -121,6 +121,8 @@ def process_docker(datasets):
         'mpnst': ['mpnst'],
         'mpnstpdx': ['mpnstpdx'],
         'pancpdo': ['pancpdo'],
+        'bladderpdo': ['bladderpdo'],
+        'sarcpdo': ['sarcpdo'],
         'cptac': ['cptac'],
         'genes': ['genes'],
         'upload': ['upload']
@@ -132,7 +134,7 @@ def process_docker(datasets):
         datasets_to_build.extend(dataset_map.get(dataset, []))
 
     # Build the docker-compose command, adding specific datasets
-    compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
+    compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
 
     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -328,9 +330,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
     # Error handling for required tokens
     if args.figshare and not figshare_token:
         raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-    if ('beataml' in args.datasets or 'mpnst' in args.datasets) and not synapse_auth_token:
+    if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo','sarcpdo']) and not synapse_auth_token:
         if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
-            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
+            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladderpdo, pancpdo, or sarcpdo datasets.")
 
     ######
     ### Begin Pipeline
@@ -407,7 +409,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
     # if args.figshare or args.validate:
     # FigShare File Prefixes:
 
-    prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
+    prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs']
     broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
     if "broad_sanger" in datasets:
         prefixes.extend(broad_sanger_datasets)
```
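Note the compose invocation changes from the v2 CLI plugin form (`docker compose`) to the standalone v1 binary (`docker-compose`). A hedged sketch, not part of the commit, of how a build script could accept either entry point:

```python
import shutil

def compose_prefix() -> list:
    # Prefer the standalone docker-compose binary if installed,
    # else fall back to the docker CLI's compose plugin.
    if shutil.which("docker-compose"):
        return ["docker-compose"]
    return ["docker", "compose"]

# The compose file name here is illustrative.
compose_command = compose_prefix() + ["-f", "docker-compose.yml", "build", "--parallel"]
```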

build/build_dataset.py
Lines changed: 4 additions & 2 deletions

```diff
@@ -44,6 +44,7 @@ def process_docker(dataset,validate):
         'mpnstpdx': ['mpnstpdx'],
         'pancpdo': ['pancpdo'],
         'cptac': ['cptac'],
+        'sarcpdo': ['sarcpdo'],
         'genes': ['genes'],
         'upload': ['upload'],
         'bladderpdo': ['bladderpdo']
@@ -57,7 +58,7 @@
 
     datasets_to_build.extend(dataset_map.get(dataset, []))
 
-    compose_command = ['docker','compose', '-f', compose_file, 'build'] + datasets_to_build
+    compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
 
     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -125,8 +126,9 @@ def process_omics(executor, dataset, should_continue):
         'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'hcmi': ['mutations', 'transcriptomics'],
+        'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
+        'sarcpdo': ['mutations', 'transcriptomics'],
         'pancpdo': ['transcriptomics'],
-        'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'bladderpdo': ['copy_number', 'mutations', 'transcriptomics']
     }
 
```
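The per-dataset omics map (now including `sarcpdo`, which provides only mutations and transcriptomics) determines which build steps run for each dataset. A small illustrative sketch of how such a map is typically consumed; the function name is hypothetical, not the script's actual call site:

```python
omics_map = {
    'sarcpdo': ['mutations', 'transcriptomics'],
    'mpnstpdx': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
}

def queue_omics_jobs(dataset: str) -> None:
    # One job per datatype the dataset actually provides; unknown
    # datasets fall through to an empty list and queue nothing.
    for datatype in omics_map.get(dataset, []):
        print(f"queueing {datatype} build for {dataset}")

queue_omics_jobs('sarcpdo')
```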

build/cptac/getCptacData.py
Lines changed: 3 additions & 2 deletions

```diff
@@ -288,8 +288,9 @@ def main():
         exit()
 
     # Remove the old values in samples (from prev file)
-    samples.drop(samples.index,inplace=True)
-
+    if 'other_id_source' in samples.columns:
+        samples = samples[samples['other_id_source'] == 'CPTAC3'].copy()
+
     # Create new samples
     if build_samples:
         # Loop through the cancer types to build samples
```
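Instead of dropping every row from the previously loaded samples table, the new code keeps only rows sourced from CPTAC3 and guards against the column being absent on a fresh run. A self-contained illustration of the filter on toy data:

```python
import pandas as pd

samples = pd.DataFrame({
    "other_id_source": ["CPTAC3", "HCMI", "CPTAC3"],
    "other_id": ["s1", "s2", "s3"],
})
if 'other_id_source' in samples.columns:
    # .copy() detaches the slice so later writes don't trigger
    # pandas' SettingWithCopy warning
    samples = samples[samples['other_id_source'] == 'CPTAC3'].copy()
assert list(samples["other_id"]) == ["s1", "s3"]
```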
