Skip to content

Commit beeb65d

Browse files
authored
Merge pull request #345 from PNNL-CompBio/sanger_data_fix
fix for #343
2 parents 1252182 + 201b9ef commit beeb65d

13 files changed

Lines changed: 499544 additions & 78 deletions

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
## Cancer Omics Drug Experiment Response Dataset
22

3+
4+
35
There has been a recent explosion of deep learning algorithms that aim to tackle the computational problem of predicting drug treatment outcome from baseline molecular measurements. To support this, we have built a benchmark dataset that harmonizes diverse datasets to better assess algorithm performance.
46

57
This package collects diverse sets of paired molecular datasets with corresponding drug sensitivity data. All data here is reprocessed and standardized so it can be easily used as a benchmark dataset for the

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,12 +566,16 @@ main<-function(){
566566
lapply(alltypes,function(dt){
567567
print(dt)
568568
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
569+
readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
569570
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
570-
readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
571+
readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
572+
573+
# readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
571574
rm(tempd)
572575
rm(temps)
573576
})
574577

575578
}
576579

577580
main()
581+

build/broad_sanger/02a-broad_sanger_proteomics.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ def main():
4848

4949
full[['study']] = 'DepMap'
5050
full[['source']] = 'Broad'
51+
##now save to separate files
52+
full.dropna(axis=0)
53+
full.to_csv('/tmp/broad_proteomics.csv.gz', index=False, compression='gzip')
5154

5255

5356
##now get sanger
@@ -69,9 +72,9 @@ def main():
6972
full2.loc[:,['study']] = 'Sanger'
7073
full2.loc[:,['source']] = 'Sanger'
7174

72-
full3 = pd.concat([full,full2])
73-
print(full3)
74-
full3.dropna(axis=0)
75-
full3.to_csv('/tmp/broad_sanger_proteomics.csv.gz',index=False, compression='gzip')
75+
#full3 = pd.concat([full,full2])
76+
#print(full3)
77+
full2.dropna(axis=0)
78+
full2.to_csv('/tmp/sanger_proteomics.csv.gz',index=False, compression='gzip')
7679

7780
main()

build/broad_sanger/05b_separate_datasets.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
import shutil
66

77
def main():
8+
9+
print("ls:\n")
10+
files = os.listdir(".")
11+
print(files)
12+
print("\n")
13+
814
datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
915
omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv
1016
samples_datatypes = ["samples"] #csv
@@ -13,14 +19,14 @@ def main():
1319

1420

1521
dataset_sources = {
16-
"CCLE": ["Broad"],
17-
"CTRPv2": ["Broad"],
18-
"PRISM": ["Broad"],
19-
"GDSCv1": ["Sanger"],
20-
"GDSCv2": ["Sanger"],
21-
"FIMM": ["Broad"],
22-
"gCSI": ["Broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
23-
"NCI60": ["Broad"]
22+
"CCLE": ["broad"],
23+
"CTRPv2": ["broad"],
24+
"PRISM": ["broad"],
25+
"GDSCv1": ["sanger"],
26+
"GDSCv2": ["sanger"],
27+
"FIMM": ["broad"],
28+
"gCSI": ["broad"], # gCSI generates its own omics data but it is comparable to CCLE. In future, retrive gCSI omics.
29+
"NCI60": ["broad"]
2430
}
2531

2632
for dataset in datasets_to_process:
@@ -70,14 +76,15 @@ def main():
7076

7177
#One by one, filter other Omics files, write to file, delete from mem.
7278
for omics in omics_datatypes:
73-
omics_filename_in = f"broad_sanger_{omics}.csv"
79+
ds = dataset_sources[dataset][0]
80+
omics_filename_in = f"{ds}_{omics}.csv"
7481
if os.path.isfile(omics_filename_in + ".gz"):
7582
omics_filename_in += ".gz"
7683

7784
omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
7885
omics_df = pl.read_csv(omics_filename_in)
7986
omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
80-
omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
87+
# omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
8188
omics_df.write_csv(omics_filename_out) #csv
8289

8390
#Rewrite as gzipped if needed

build/broad_sanger/build_misc.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,12 @@ set -euo pipefail
33

44
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
55

6-
cp /tmp/broad_sanger* .
6+
cp /tmp/broad* .
7+
cp /tmp/sanger* .
78

89
echo "Running 05a_remove_problem_drugs.py..."
910
/opt/venv/bin/python 05a_remove_problem_drugs.py
1011

1112
echo "Running 05b_separate_datasets.py..."
1213
/opt/venv/bin/python 05b_separate_datasets.py
1314

14-
echo "Removing broad_sanger* files..."
15-
rm broad_sanger*

build/build_all.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def main():
4040
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
4141
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
4242
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
43-
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo',help='Datasets to process. Defaults to all available.')
43+
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo',help='Datasets to process. Defaults to all available.')
4444
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
4545
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
4646
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -121,6 +121,7 @@ def process_docker(datasets):
121121
'mpnst': ['mpnst'],
122122
'mpnstpdx': ['mpnstpdx'],
123123
'pancpdo': ['pancpdo'],
124+
'bladderpdo': ['bladderpdo'],
124125
'cptac': ['cptac'],
125126
'genes': ['genes'],
126127
'upload': ['upload']
@@ -132,7 +133,7 @@ def process_docker(datasets):
132133
datasets_to_build.extend(dataset_map.get(dataset, []))
133134

134135
# Build the docker-compose command, adding specific datasets
135-
compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
136+
compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
136137

137138
log_file_path = 'local/docker.log'
138139
env = os.environ.copy()
@@ -328,7 +329,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
328329
# Error handling for required tokens
329330
if args.figshare and not figshare_token:
330331
raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
331-
if ('beataml' in args.datasets or 'mpnst' in args.datasets) and not synapse_auth_token:
332+
if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladderpdo', 'pancpdo']) and not synapse_auth_token:
332333
if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
333334
raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
334335

@@ -407,7 +408,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
407408
# if args.figshare or args.validate:
408409
# FigShare File Prefixes:
409410

410-
prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
411+
prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo', 'genes', 'drugs']
411412
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
412413
if "broad_sanger" in datasets:
413414
prefixes.extend(broad_sanger_datasets)

build/build_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def process_docker(dataset,validate):
5757

5858
datasets_to_build.extend(dataset_map.get(dataset, []))
5959

60-
compose_command = ['docker','compose', '-f', compose_file, 'build'] + datasets_to_build
60+
compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
6161

6262
log_file_path = 'local/docker.log'
6363
env = os.environ.copy()

build/cptac/getCptacData.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,9 @@ def main():
288288
exit()
289289

290290
# Remove the old values in samples (from prev file)
291-
samples.drop(samples.index,inplace=True)
292-
291+
if 'other_id_source' in samples.columns:
292+
samples = samples[samples['other_id_source'] == 'CPTAC3'].copy()
293+
293294
# Create new samples
294295
if build_samples:
295296
# Loop through the cancer types to build samples

build/docker/Dockerfile.broad_sanger_exp

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,32 @@
11
FROM r-base:4.4.1
2-
ENV DEBIAN_FRONTEND=noninteractive
3-
RUN apt-get update --fix-missing
4-
#RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential libpq-dev python3.10 python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev libglpk-dev
5-
6-
# RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libglpk-dev libxml2-dev libpq-dev
7-
8-
RUN apt-get install -y --fix-missing --allow-unauthenticated \
9-
build-essential \
10-
python3-pip \
11-
python3-setuptools \
12-
python3-dev \
13-
python3-venv \
14-
libcurl4-openssl-dev \
15-
libglpk-dev \
16-
libxml2-dev \
17-
libpq-dev \
18-
ca-certificates
19-
20-
RUN python3 -m venv /opt/venv
21-
RUN /opt/venv/bin/pip3 install --upgrade pip
222

3+
ENV DEBIAN_FRONTEND=noninteractive
234

24-
# Set MPLCONFIGDIR to a writable directory
5+
# Update and upgrade packages, then install required packages.
6+
RUN apt-get update --fix-missing && \
7+
apt-get upgrade -y && \
8+
apt-get install -y --fix-missing --allow-unauthenticated \
9+
build-essential \
10+
python3-pip \
11+
python3-setuptools \
12+
python3-dev \
13+
python3-venv \
14+
libcurl4-openssl-dev \
15+
libglpk-dev \
16+
libxml2-dev \
17+
libpq-dev \
18+
ca-certificates && \
19+
apt-get clean && rm -rf /var/lib/apt/lists/*
20+
21+
# Create and upgrade the Python virtual environment.
22+
RUN python3 -m venv /opt/venv && \
23+
/opt/venv/bin/pip install --upgrade pip
24+
25+
# Set MPLCONFIGDIR to a writable directory and create it.
2526
ENV MPLCONFIGDIR=/app/tmp/matplotlib
2627
RUN mkdir -p /app/tmp/matplotlib
2728

28-
29+
# Set Python path and working directory.
2930
ENV PYTHONPATH "${PYTHONPATH}:/app"
3031
WORKDIR /app
3132

Lines changed: 28 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,36 @@
11
FROM r-base:4.4.1
2+
23
ENV DEBIAN_FRONTEND=noninteractive
3-
RUN apt-get update --fix-missing
4-
5-
# RUN apt-get install -y --fix-missing --allow-unauthenticated build-essential python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libglpk-dev libxml2-dev libpq-dev
6-
7-
RUN apt-get install -y --fix-missing --allow-unauthenticated \
8-
build-essential \
9-
python3-pip \
10-
python3-setuptools \
11-
python3-dev \
12-
python3-venv \
13-
libcurl4-openssl-dev \
14-
libglpk-dev \
15-
libxml2-dev \
16-
libpq-dev \
17-
ca-certificates
18-
19-
RUN python3 -m venv /opt/venv
20-
RUN /opt/venv/bin/pip3 install --upgrade pip
21-
22-
# Set MPLCONFIGDIR to a writable directory
4+
5+
# Update and upgrade packages, then install required packages.
6+
RUN apt-get update --fix-missing && \
7+
apt-get upgrade -y && \
8+
apt-get install -y --fix-missing --allow-unauthenticated \
9+
build-essential \
10+
python3-pip \
11+
python3-setuptools \
12+
python3-dev \
13+
python3-venv \
14+
libcurl4-openssl-dev \
15+
libglpk-dev \
16+
libxml2-dev \
17+
libpq-dev \
18+
ca-certificates && \
19+
apt-get clean && rm -rf /var/lib/apt/lists/*
20+
21+
# Create and upgrade the Python virtual environment.
22+
RUN python3 -m venv /opt/venv && \
23+
/opt/venv/bin/pip install --upgrade pip
24+
25+
# Set MPLCONFIGDIR to a writable directory and create it.
2326
ENV MPLCONFIGDIR=/app/tmp/matplotlib
2427
RUN mkdir -p /app/tmp/matplotlib
2528

26-
29+
# Set Python path and working directory.
2730
ENV PYTHONPATH "${PYTHONPATH}:/app"
2831
WORKDIR /app
2932

33+
# Add application files.
3034
ADD build/broad_sanger/01-broadSangerSamples.R ./
3135
ADD build/broad_sanger/02-broadSangerOmics.R ./
3236
ADD build/broad_sanger/02a-broad_sanger_proteomics.py ./
@@ -40,14 +44,8 @@ ADD build/broad_sanger/05b_separate_datasets.py ./
4044
ADD build/broad_sanger/requirements.txt .
4145
ADD build/broad_sanger/omics_requirements.r .
4246

43-
# installing r libraries
47+
# Install R libraries.
4448
RUN Rscript omics_requirements.r
4549

46-
# installing python libraries
47-
RUN /opt/venv/bin/pip3 install -r requirements.txt
48-
49-
50-
51-
52-
53-
50+
# Install Python libraries.
51+
RUN /opt/venv/bin/pip install -r requirements.txt

0 commit comments

Comments
 (0)