Commit 51adfeb

Moved over files from main to enable build test
1 parent: ea9be4a

12 files changed: 2279 additions & 38 deletions

build.out

Lines changed: 1721 additions & 0 deletions
Large diffs are not rendered by default.

build/build_dataset.py

Lines changed: 321 additions & 0 deletions
@@ -0,0 +1,321 @@
"""
Script that builds a single dataset.
"""

import os
import argparse
import subprocess
import shutil
import gzip
from concurrent.futures import ThreadPoolExecutor
import glob


def run_docker_cmd(cmd_arr, filename):
    '''
    Wrapper for 'docker run' command. Executes a Docker container with the specified command.
    '''
    print('Running...', filename)
    env = os.environ.copy()
    if 'SYNAPSE_AUTH_TOKEN' not in env:
        print('You need to set the SYNAPSE_AUTH_TOKEN to access the MPNST and beatAML datasets')
        docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/:/tmp/", '--platform=linux/amd64']
    else:
        docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/:/tmp/", '-e', f"SYNAPSE_AUTH_TOKEN={env['SYNAPSE_AUTH_TOKEN']}", '--platform=linux/amd64']

    cmd = docker_run + cmd_arr
    print('Executing command:', ' '.join(cmd))
    res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if res.returncode != 0:
        print(res.stderr.decode())
        exit(f'{filename} failed')
    else:
        print(f'{filename} completed successfully')


def process_docker(dataset, validate):
    '''
    Build Docker images required for the specified dataset.
    '''
    compose_file = 'build/docker/docker-compose.yml'
    dataset_map = {
        'broad_sanger': ['broad_sanger_exp', 'broad_sanger_omics'],
        'hcmi': ['hcmi'],
        'beataml': ['beataml'],
        'mpnst': ['mpnst'],
        'mpnstpdx': ['mpnstpdx'],
        'cptac': ['cptac'],
        'genes': ['genes'],
        'upload': ['upload']
    }

    # Collect container names to build based on the dataset provided. Always build 'genes'.
    datasets_to_build = ['genes']
    # Append upload if the validation step is included
    if validate:
        datasets_to_build.append('upload')

    datasets_to_build.extend(dataset_map.get(dataset, []))

    compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build

    log_file_path = 'local/docker.log'
    env = os.environ.copy()

    print(f"Docker-compose is building images for {', '.join(datasets_to_build)}. View output in {log_file_path}.")

    with open(log_file_path, 'w') as log_file:
        try:
            subprocess.run(compose_command, env=env, stdout=log_file, stderr=log_file, text=True, check=True)
            log_file.write("Docker images built successfully.\n")
            print(f"Docker images for {', '.join(datasets_to_build)} built successfully. Details logged in {log_file_path}.")
        except subprocess.CalledProcessError as e:
            log_file.write(f"Docker compose build failed with error: {e}\n")
            print(f"Docker compose build failed. See {log_file_path} for details.")
            raise


def process_genes(executor):
    '''
    Build the genes file if it does not exist.
    '''
    if not os.path.exists('local/genes.csv'):
        executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')


def process_samples(executor, dataset, use_prev_dataset, should_continue):
    '''
    Build the samples file for the specified dataset.
    '''
    samples_file = f'local/{dataset}_samples.csv'
    if should_continue and os.path.exists(samples_file):
        print(f"Samples file for {dataset} already exists. Skipping samples build.")
        return

    prev_samples_file = f'/tmp/{use_prev_dataset}_samples.csv' if use_prev_dataset else ''
    di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
    filename = f'{dataset} samples'
    executor.submit(run_docker_cmd, [di, 'bash', 'build_samples.sh', prev_samples_file], filename)


def process_drugs(executor, dataset, use_prev_dataset, should_continue):
    '''
    Build the drugs file for the specified dataset.
    '''
    if dataset in ['cptac', 'hcmi']:
        return  # No drugs to process for these datasets

    drugs_file = f'local/{dataset}_drugs.tsv'
    if should_continue and os.path.exists(drugs_file):
        print(f"Drugs file for {dataset} already exists. Skipping drugs build.")
        return

    prev_drugs_file = f'/tmp/{use_prev_dataset}_drugs.tsv' if use_prev_dataset else ''
    dflist = [prev_drugs_file] if use_prev_dataset else []
    di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
    filename = f'{dataset} drugs'
    executor.submit(run_docker_cmd, [di, 'bash', 'build_drugs.sh', ','.join(dflist)], filename)


def process_omics(executor, dataset, should_continue):
    '''
    Build the omics files for the specified dataset.
    '''
    # Map datasets to their expected omics files
    dataset_omics_files = {
        'beataml': ['mutations', 'proteomics', 'transcriptomics'],
        'mpnst': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
        'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
        'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
        'hcmi': ['mutations', 'transcriptomics'],
        'mpnstpdx': ['copy_number', 'mutations', 'proteomics', 'transcriptomics']
    }

    expected_omics = dataset_omics_files.get(dataset, [])

    if not expected_omics:
        print(f"No omics data expected for dataset {dataset}. Skipping omics build.")
        return

    # Check whether all expected omics files exist
    omics_files_exist = True
    for omics_type in expected_omics:
        patterns = [
            f'local/{dataset}_{omics_type}.csv',
            f'local/{dataset}_{omics_type}.csv.gz',
            f'local/{dataset}_{omics_type}.tsv',
            f'local/{dataset}_{omics_type}.tsv.gz'
        ]
        file_found = False
        for pattern in patterns:
            matches = glob.glob(pattern)
            if matches:
                file_found = True
                break
        if not file_found:
            omics_files_exist = False
            break  # If any omics files are missing, just build / rebuild them all.

    if should_continue and omics_files_exist:
        print(f"Omics files for {dataset} already exist. Skipping omics build.")
        return

    di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset
    filename = f'{dataset} omics'
    executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename)


def process_experiments(executor, dataset, should_continue):
    '''
    Build the experiments files for the specified dataset.
    '''
    if dataset in ['cptac', 'hcmi']:
        return  # No experiments to process for these datasets

    experiments_file = f'local/{dataset}_experiments.tsv'
    if should_continue and os.path.exists(experiments_file):
        print(f"Experiments file for {dataset} already exists. Skipping experiments build.")
        return

    di = 'broad_sanger_exp' if dataset == 'broad_sanger' else dataset
    filename = f'{dataset} experiments'
    executor.submit(run_docker_cmd, [di, 'bash', 'build_exp.sh', f'/tmp/{dataset}_samples.csv', f'/tmp/{dataset}_drugs.tsv'], filename)


def process_misc(executor, datasets):
    '''
    Run all misc scripts concurrently or one at a time.
    '''
    last_misc_future = None
    # Currently this only applies to broad_sanger. Add others here if they need a final step.
    if "broad_sanger" in datasets:
        datasets = ["broad_sanger"]
    else:
        return
    for da in datasets:
        di = 'broad_sanger_omics' if da == 'broad_sanger' else da
        # Wait for the previous misc run before starting the next one
        if last_misc_future:
            last_misc_future.result()
        last_misc_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_misc.sh'], f'{da} misc')


def decompress_file(file_path):
    """Decompress a gzip file and delete the original compressed file."""
    with gzip.open(file_path, 'rb') as f_in:
        decompressed_file_path = file_path[:-3]  # Remove '.gz' from the filename
        with open(decompressed_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(file_path)


def compress_file(file_path):
    """Compress a file using gzip and delete the original uncompressed file."""
    compressed_file_path = file_path + '.gz'
    with open(file_path, 'rb') as f_in:
        with gzip.open(compressed_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(file_path)


def run_docker_validate_cmd(cmd_arr, all_files_dir, name):
    '''
    Wrapper for 'docker run' command used during validation and uploads.
    '''
    env = os.environ.copy()
    docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp"]
    docker_run.extend(['upload'])
    docker_run.extend(cmd_arr)
    print('Executing:', ' '.join(docker_run))
    res = subprocess.run(docker_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if res.returncode != 0:
        print(res.stderr.decode())
        exit(f'{name} failed')
    else:
        print(f'{name} completed successfully')


def run_schema_checker(dataset):
    '''
    Run the schema checker on the built files for the specified dataset.
    '''
    # Prepare the directory with the built files
    prefixes = ['genes', dataset]
    datasets = [dataset]
    broad_sanger_datasets = ["ccle", "ctrpv2", "fimm", "gdscv1", "gdscv2", "gcsi", "prism", "nci60"]
    all_files_dir = 'all_files_dir'
    if "broad_sanger" == dataset:
        prefixes.extend(broad_sanger_datasets)
        datasets.extend(broad_sanger_datasets)
        datasets.remove("broad_sanger")
        prefixes.remove("broad_sanger")

    if not os.path.exists(f'local/{all_files_dir}'):
        os.makedirs(f'local/{all_files_dir}')

    # Move relevant files to all_files_dir
    for file in os.listdir('local'):
        if any(file.startswith(prefix) for prefix in prefixes):
            shutil.move(os.path.join('local', file), os.path.join('local', all_files_dir, file))

    # Decompress any compressed files
    for file in os.listdir(f'local/{all_files_dir}'):
        if file.endswith('.gz'):
            decompress_file(os.path.join('local', all_files_dir, file))

    # Run schema checker
    schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
    run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')


def main():
    parser = argparse.ArgumentParser(
        description="This script builds a single dataset."
    )
    parser.add_argument('--dataset', required=True, help='Name of the dataset to build')
    parser.add_argument('--use_prev_dataset', help='Prefix of the previous dataset for sample and drug ID assignment')
    parser.add_argument('--build', action='store_true', help='Run data build')
    parser.add_argument('--validate', action='store_true', help='Run schema checker on the built files')
    parser.add_argument('--continue', dest='should_continue', action='store_true', help='Continue from where the build left off by skipping existing files')

    args = parser.parse_args()

    if not os.path.exists('local'):
        os.mkdir('local')

    # Build Docker images
    process_docker(args.dataset, args.validate)

    if args.build:
        # Use ThreadPoolExecutor for parallel execution
        with ThreadPoolExecutor() as executor:
            # Always build genes file
            process_genes(executor)

            # Build samples and drugs
            samples_future = executor.submit(process_samples, executor, args.dataset, args.use_prev_dataset, args.should_continue)
            drugs_future = executor.submit(process_drugs, executor, args.dataset, args.use_prev_dataset, args.should_continue)

            samples_future.result()
            drugs_future.result()

            print("Samples and Drugs Files Completed.")

        with ThreadPoolExecutor() as executor:
            # Build omics and experiments
            omics_future = executor.submit(process_omics, executor, args.dataset, args.should_continue)
            experiments_future = executor.submit(process_experiments, executor, args.dataset, args.should_continue)

            omics_future.result()
            experiments_future.result()

            print("Experiments and Omics Files completed.")

    with ThreadPoolExecutor() as executor:
        if args.build:
            # Final misc build step (currently only needed for broad_sanger)
            misc_thread = executor.submit(process_misc, executor, args.dataset)
            misc_thread.result()
            print("Final build step complete.")

    if args.validate:
        run_schema_checker(args.dataset)
        print("Validation completed.")


if __name__ == '__main__':
    main()
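
For reference, the flags defined in main() compose as follows. A minimal sketch of typical invocations, assuming the script is run from the repository root (so local/ and build/docker/docker-compose.yml resolve) and that a real Synapse token is substituted for the hypothetical placeholder:

    export SYNAPSE_AUTH_TOKEN=<your-token>  # required for the MPNST and beatAML datasets
    python3 build/build_dataset.py --dataset beataml --build --validate

    # Resume an interrupted build, skipping files that already exist under local/:
    python3 build/build_dataset.py --dataset beataml --build --continue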

build/docker/Dockerfile.mpnst

Lines changed: 36 additions & 8 deletions
@@ -1,16 +1,44 @@
 FROM r-base:4.3.2
+
+# Set environment to noninteractive
 ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update --allow-insecure-repositories
-#RUN apt-get install -y --allow-unauthenticated build-essential --fix-missing libpq-dev python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev
-RUN apt-get install -y --allow-unauthenticated build-essential --fix-missing python3-pip python3-setuptools python3-dev python3-venv libcurl4-openssl-dev libxml2-dev
 
+# Update package list and install required packages
+RUN apt-get update && \
+    apt-get install -y build-essential wget curl libcurl4-openssl-dev libxml2-dev \
+    zlib1g-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev
+
+# Download and compile Python 3.10 with shared library support
+RUN wget https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
+    tar -xf Python-3.10.12.tgz && \
+    cd Python-3.10.12 && \
+    ./configure --enable-optimizations --enable-shared && \
+    make -j$(nproc) && \
+    make altinstall && \
+    cd .. && \
+    rm -rf Python-3.10.12.tgz Python-3.10.12
+
+# Set Python 3.10 as default
+RUN ln -s /usr/local/bin/python3.10 /usr/bin/python3 && \
+    ln -s /usr/local/bin/pip3.10 /usr/bin/pip3
+
+# Update library paths for Python shared library
+RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/python3.10.conf && ldconfig
+
+# Create a Python virtual environment
 RUN python3 -m venv /opt/venv
-RUN /opt/venv/bin/pip3 install --upgrade pip
-
+RUN /opt/venv/bin/pip install --upgrade pip
 
-ENV PYTHONPATH "${PYTHONPATH}:/app"
+# Set environment variables for reticulate
+ENV RETICULATE_PYTHON="/opt/venv/bin/python3"
+ENV PYTHONPATH=/app
 WORKDIR /app
 
+# Set MPLCONFIGDIR to a writable directory
+ENV MPLCONFIGDIR=/app/tmp/matplotlib
+RUN mkdir -p /app/tmp/matplotlib
+
+# Add necessary files to the container
 ADD build/mpnst/requirements.txt .
 ADD build/mpnst/requirements.r .
 ADD build/mpnst/* ./
@@ -19,8 +47,8 @@ ADD build/utils/* ./
 # installing python libraries
 RUN /opt/venv/bin/pip3 install -r requirements.txt
 
-# installing r libraries
+# Install all R libraries from requirements.r
 RUN Rscript requirements.r
 
-
+# Set up volume for temporary storage
 VOLUME ["/tmp"]
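
Because the image now compiles Python 3.10 with --enable-shared and points reticulate at the virtualenv interpreter via RETICULATE_PYTHON, one quick sanity check is to confirm that reticulate resolves that interpreter inside the built image. A hedged sketch, assuming reticulate is among the packages installed by requirements.r and using a hypothetical image tag (in practice the service is built via build/docker/docker-compose.yml):

    docker build -f build/docker/Dockerfile.mpnst -t mpnst-test .
    docker run --rm mpnst-test Rscript -e 'reticulate::py_config()'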
