
Commit 9262ec5

Working mapping scripts into build_all.py. In progress still
1 parent 7efbc25 commit 9262ec5

4 files changed

Lines changed: 484 additions & 104 deletions

File tree

build/build_all.py

Lines changed: 88 additions & 74 deletions
@@ -11,37 +11,37 @@
 import gzip
 from glob import glob
 import sys
+import requests
 
 def main():
     parser=argparse.ArgumentParser(
-        description="This script initializes all docker containers, builds datasets, validates them, and uploads to Figshare and PyPI.",
+        description="This script initializes all docker containers, builds datasets, validates them, and uploads to Figshare.",
         epilog="""Examples of usage:
 
-        Build all datasets in a high memory environment, validate them, and upload to Figshare and PyPI:
-        python build/build_all.py --all --high_mem --validate --pypi --figshare --version 0.1.29
+        Build all datasets in a high memory environment, validate them, and upload to Figshare:
+        python build/build_all.py --all --high_mem --validate --figshare --version 0.1.29
 
         Build only experiment files. This assumes preceding steps (docker images, samples, omics, and drugs) have already been completed:
        python build/build_all.py --exp
 
         Validate all local files without building or uploading. These files must be located in ./local. Includes compression/decompression steps.
         python build/build_all.py --validate
 
-        Upload the latest data to Figshare and PyPI (ensure tokens are set in the local environment):
-        python build/build_all.py --figshare --pypi --version 0.1.30
+        Upload the latest data to Figshare (ensure tokens are set in the local environment):
+        python build/build_all.py --figshare --version 0.1.30
         """
     )
     parser.add_argument('--docker',dest='docker',default=False,action='store_true', help="Build all docker images.")
     parser.add_argument('--samples',dest='samples',default=False,action='store_true', help="Build all sample files.")
     parser.add_argument('--omics',dest='omics',default=False,action='store_true', help="Build all omics files.")
     parser.add_argument('--drugs',dest='drugs',default=False,action='store_true', help="Build all drug files")
     parser.add_argument('--exp',dest='exp',default=False,action='store_true', help="Build all experiment file.")
-    parser.add_argument('--validate', action='store_true', help="Run schema checker on all local files. Note this will be run, whether specified or not, if figshare or pypi arguments are included.")
+    parser.add_argument('--validate', action='store_true', help="Run schema checker on all local files. Note this will be run, whether specified or not, if figshare arguments are included.")
     parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
-    parser.add_argument('--pypi', action='store_true', help="Update PYPI Package with latest Figshare data. PYPI_TOKEN must be set in local environment.")
-    parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate, figshare, or pypi commands.")
+    parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
     parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
     parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx',help='Datasets to process. Defaults to all available.')
-    parser.add_argument('--version', type=str, required=False, help='Version number for the PyPI package and Figshare upload title (e.g., "0.1.29"). This is required for Figshare and PyPI upload. This must be a higher version than previously published versions.')
+    parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')

@@ -131,7 +131,7 @@ def process_docker(datasets):
             datasets_to_build.extend(dataset_map.get(dataset, []))
 
         # Build the docker-compose command, adding specific datasets
-        compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
+        compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
 
         log_file_path = 'local/docker.log'
         env = os.environ.copy()
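
A note on the change above: the standalone Compose v1 binary (docker-compose) is swapped for the Compose v2 plugin syntax (docker compose). Below is a minimal sketch of launching such a command with subprocess, in the same list-of-arguments style the script uses; the compose file path, service names, and logging approach are placeholders, not the script's actual values.

    import subprocess

    # Hypothetical example values; build_all.py derives these from its arguments.
    compose_file = 'build/docker/docker-compose.yml'
    datasets_to_build = ['hcmi', 'beataml']

    # Compose v2 is invoked as a docker subcommand rather than a standalone binary.
    compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build

    with open('local/docker.log', 'w') as log_file:
        # Stream build output to a log file; illustrative only.
        subprocess.run(compose_command, stdout=log_file, stderr=subprocess.STDOUT, check=True)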
@@ -266,12 +266,14 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
         docker_run = ['docker', 'run', '--rm', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '-e', f"VERSION={version}"]
 
         # Add Appropriate Environment Variables
-        if 'PYPI_TOKEN' in env and name == 'PyPI':
-            docker_run.extend(['-e', f"PYPI_TOKEN={env['PYPI_TOKEN']}", 'upload'])
+        if name == "validate":
+            docker_run.extend(['upload'])
         if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
             docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
         if name == "validate":
             docker_run.extend(['upload'])
+        if name == "Map_Drugs" or name == "Map_Samples":
+            docker_run.extend(['upload'])
         if 'GITHUB_TOKEN' in env and name == "GitHub":
             docker_run.extend(['-e', f"GITHUB_TOKEN={env['GITHUB_TOKEN']}", 'upload'])

@@ -302,22 +304,31 @@ def compress_file(file_path):
             with gzip.open(compressed_file_path, 'wb') as f_out:
                 shutil.copyfileobj(f_in, f_out)
         os.remove(file_path)
+
+    def get_latest_commit_hash(owner, repo, branch='main'):
+        """
+        Returns the SHA of the latest commit on the specified branch.
+        """
+        url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch}"
+        response = requests.get(url)
+        response.raise_for_status()
+
+        # The commit data is in JSON format; the 'sha' field is the full commit hash.
+        commit_data = response.json()
+        return commit_data['sha']
 
     ######
     ### Pre-Build Environment Token Check
     #####
 
     figshare_token = os.getenv('FIGSHARE_TOKEN')
-    pypi_token = os.getenv('PYPI_TOKEN')
     synapse_auth_token = os.getenv('SYNAPSE_AUTH_TOKEN')
     github_token = os.getenv('GITHUB_TOKEN')
 
 
     # Error handling for required tokens
     if args.figshare and not figshare_token:
         raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-    if args.pypi and not pypi_token:
-        raise ValueError("PYPI_TOKEN environment variable is not set.")
     if ('beataml' in args.datasets or 'mpnst' in args.datasets) and not synapse_auth_token:
         if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
             raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
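
The get_latest_commit_hash helper added above is why import requests now appears at the top of the file: it calls the public GitHub API and returns the SHA of the branch head. A standalone sketch of exercising it follows; the owner/repo values are illustrative, and this diff does not yet show where the helper is called from.

    import requests

    def get_latest_commit_hash(owner, repo, branch='main'):
        """Return the SHA of the latest commit on the specified branch."""
        url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch}"
        response = requests.get(url)
        response.raise_for_status()  # surfaces 404s (bad repo/branch) and rate-limit errors
        return response.json()['sha']

    if __name__ == '__main__':
        # Illustrative call against the project repository; unauthenticated requests are rate-limited.
        print(get_latest_commit_hash('PNNL-CompBio', 'coderdata', branch='main'))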
@@ -393,77 +404,74 @@ def compress_file(file_path):
     ######
     ### Begin Upload and/or validation
     #####
-
-    if args.pypi or args.figshare or args.validate:
+    if args.figshare or args.validate or github_token:
+    # if args.figshare or args.validate:
         # FigShare File Prefixes:
-        prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
-        broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
-        if "broad_sanger" in datasets:
-            prefixes.extend(broad_sanger_datasets)
-            datasets.extend(broad_sanger_datasets)
-            datasets.remove("broad_sanger")
-
 
-        figshare_token = os.getenv('FIGSHARE_TOKEN')
-        pypi_token = os.getenv('PYPI_TOKEN')
-
-        all_files_dir = 'local/all_files_dir'
-        if not os.path.exists(all_files_dir):
-            os.makedirs(all_files_dir)
-
-        # Ensure pypi tokens are available
-        if args.pypi and not pypi_token:
-            raise ValueError("Required tokens (PYPI) are not set in environment variables.")
+        # prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
+        # broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
+        # if "broad_sanger" in datasets:
+        #     prefixes.extend(broad_sanger_datasets)
+        #     datasets.extend(broad_sanger_datasets)
+        #     datasets.remove("broad_sanger")
+
+        # figshare_token = os.getenv('FIGSHARE_TOKEN')
+
+        # all_files_dir = 'local/all_files_dir'
+        # if not os.path.exists(all_files_dir):
+        #     os.makedirs(all_files_dir)
 
-        # Ensure figshare tokens are available
-        if args.figshare and not figshare_token:
-            raise ValueError("Required tokens (FIGSHARE) are not set in environment variables.")
+        # # Ensure figshare tokens are available
+        # if args.figshare and not figshare_token:
+        #     raise ValueError("Required tokens (FIGSHARE) are not set in environment variables.")
 
-        # Ensure version is specified
-        if (args.figshare or args.pypi) and not args.version:
-            raise ValueError("Version must be specified when pushing to pypi or figshare")
-
-        # Move relevant files to a designated directory
-        for file in glob(os.path.join("local", '*.*')):
-            if any(file.startswith(os.path.join("local", prefix)) for prefix in prefixes):
-                shutil.move(file, os.path.join(all_files_dir, os.path.basename(file)))
-
-        # Decompress all compressed files in the directory for schema checking
-        for file in glob(os.path.join(all_files_dir, '*.gz')):
-            decompress_file(file)
-
-        # Run schema checker - This will always run if uploading data.
-        schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
-        run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
+        # # Ensure version is specified
+        # if args.figshare and not args.version:
+        #     raise ValueError("Version must be specified when pushing to figshare")
+
+        # # Move relevant files to a designated directory
+        # for file in glob(os.path.join("local", '*.*')):
+        #     if any(file.startswith(os.path.join("local", prefix)) for prefix in prefixes):
+        #         shutil.move(file, os.path.join(all_files_dir, os.path.basename(file)))
+
+        # # Decompress all compressed files in the directory for schema checking
+        # for file in glob(os.path.join(all_files_dir, '*.gz')):
+        #     decompress_file(file)
+
+        # # Run schema checker - This will always run if uploading data.
+        # schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
+        # run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
 
-        print("Validation complete. Proceeding with file compression/decompression adjustments")
+        # print("Validation complete. Proceeding with file compression/decompression adjustments")
 
-        # Compress or decompress files based on specific conditions after checking
-        for file in glob(os.path.join(all_files_dir, '*')):
-            is_compressed = file.endswith('.gz')
-            if ('samples' in file or 'figshare' in file) and is_compressed:
-                decompress_file(file)
-            elif not ('samples' in file or 'figshare' in file) and not is_compressed:
-                compress_file(file)
-
-        print("File compression and decompression adjustments are complete.")
+        # # Compress or decompress files based on specific conditions after checking
+        # for file in glob(os.path.join(all_files_dir, '*')):
+        #     is_compressed = file.endswith('.gz')
+        #     if ('samples' in file or 'figshare' in file) and is_compressed:
+        #         decompress_file(file)
+        #     elif not ('samples' in file or 'figshare' in file) and not is_compressed:
+        #         compress_file(file)
+
+        # print("File compression and decompression adjustments are complete.")
 
-        # Upload to Figshare using Docker
-        if args.figshare and args.version and figshare_token:
-            figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
-            run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)
-
-        # Upload to PyPI using Docker
-        if args.pypi and args.version and pypi_token:
-            pypi_command = ['python3', 'scripts/push_to_pypi.py', '-y', '/tmp/figshare_latest.yml', '-d', 'coderdata/download/downloader.py', "-v", args.version]
-            run_docker_upload_cmd(pypi_command, 'all_files_dir', 'PyPI', args.version)
+        # # Upload to Figshare using Docker
+        # if args.figshare and args.version and figshare_token:
+        #     figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
+        #     run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)
+
 
         # Push changes to GitHub using Docker
-        if args.version and args.figshare and args.pypi and pypi_token and figshare_token and github_token and args.github_username and args.github_email:
+        # if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:
+        if args.version and github_token and args.github_username and args.github_email:
+
             git_command = [
                 'bash', '-c', (
                     f'git config --global user.name "{args.github_username}" '
                     f'&& git config --global user.email "{args.github_email}" '
+                    f'&& cp /tmp/improve_sample_mapping.json /usr/src/app/coderdata/build/improve_sample_mapping.json '
+                    f'&& cp /tmp/improve_drug_mapping.json /usr/src/app/coderdata/build/improve_drug_mapping.json '
+                    f'&& git add build/improve_sample_mapping.json '
+                    f'&& git add build/improve_drug_mapping.json '
                    f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
                    f'&& git add docs/_data/figshare_latest.yml '
                    f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
@@ -472,8 +480,14 @@ def compress_file(file_path):
                     f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git --tags'
                 )
             ]
-            run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
 
+            sample_mapping_command = ['python3', 'scripts/map_improve_sample_ids.py', '--local_dir', "/tmp", '--version', args.version]
+            run_docker_upload_cmd(sample_mapping_command, 'all_files_dir', 'Map_Samples', args.version)
 
+            drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
+            run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
+
+            run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
+
 if __name__ == '__main__':
     main()
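
Read together with the Map_Drugs/Map_Samples branch added to run_docker_upload_cmd earlier in this diff, the two new mapping steps appear intended to run inside the upload image with local/all_files_dir mounted at /tmp. Below is a rough sketch of the docker run invocation the helper would assemble for the sample-mapping step, assuming (the full helper body is not shown here) that cmd_arr is appended after the image name 'upload'.

    import os
    import subprocess

    version = '0.1.30'  # illustrative; supplied via --version in practice
    cmd_arr = ['python3', 'scripts/map_improve_sample_ids.py', '--local_dir', '/tmp', '--version', version]

    docker_run = ['docker', 'run', '--rm',
                  '-v', f"{os.getcwd()}/local/all_files_dir:/tmp",  # the script itself uses env['PWD']
                  '-e', f"VERSION={version}",
                  'upload']      # image name appended for Map_Samples/Map_Drugs, per the branch above
    docker_run += cmd_arr        # assumed: the helper appends cmd_arr after the base command

    subprocess.run(docker_run, check=True)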

build/docker/Dockerfile.upload

Lines changed: 14 additions & 3 deletions
@@ -2,10 +2,21 @@ FROM python:3.9
 
 WORKDIR /usr/src/app
 
-RUN python -m pip install --upgrade pip setuptools wheel twine packaging pyyaml requests linkml
+RUN python -m pip install --upgrade pip pyyaml requests linkml
 
 RUN apt-get update && apt-get install -y git
 
 
-COPY ./schema /usr/src/app/schema
-ADD scripts/check_schema.py ./
+RUN git clone https://github.com/PNNL-CompBio/coderdata.git
+WORKDIR /usr/src/app/coderdata
+
+# COPY ./schema /usr/src/app/schema
+# ADD scripts/check_schema.py ./
+
+# #Add id mapping script
+# ADD scripts/map_improve_sample_ids.py ./
+# ADD scripts/map_improve_drug_ids.py ./
+
+# #Add improve_mapping.json file. If file is not present this won't fail.
+# ADD build/* ./
+
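
With the Dockerfile change above, the upload image clones the coderdata repository at build time and sets /usr/src/app/coderdata as the working directory, so scripts such as scripts/map_improve_sample_ids.py resolve from the clone instead of being ADDed individually. A hedged sketch of building and smoke-testing the image in the same subprocess style as build_all.py; the image tag and build context are assumptions based on this diff.

    import subprocess

    # Build the upload image from the repository root (tag name and context are assumed).
    subprocess.run(['docker', 'build', '-f', 'build/docker/Dockerfile.upload', '-t', 'upload', '.'], check=True)

    # Smoke test: git is installed and WORKDIR is the clone, so this prints the cloned HEAD commit.
    subprocess.run(['docker', 'run', '--rm', 'upload', 'git', 'rev-parse', '--short', 'HEAD'], check=True)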
