 import gzip
 from glob import glob
 import sys
+import requests
 
 def main():
     parser = argparse.ArgumentParser(
-        description="This script initializes all docker containers, builds datasets, validates them, and uploads to Figshare and PyPI.",
+        description="This script initializes all docker containers, builds datasets, validates them, and uploads to Figshare.",
         epilog="""Examples of usage:
 
-Build all datasets in a high memory environment, validate them, and upload to Figshare and PyPI:
-  python build/build_all.py --all --high_mem --validate --pypi --figshare --version 0.1.29
+Build all datasets in a high memory environment, validate them, and upload to Figshare:
+  python build/build_all.py --all --high_mem --validate --figshare --version 0.1.29
 
 Build only experiment files. This assumes preceding steps (docker images, samples, omics, and drugs) have already been completed:
   python build/build_all.py --exp
 
 Validate all local files without building or uploading. These files must be located in ./local. Includes compression/decompression steps.
   python build/build_all.py --validate
 
-Upload the latest data to Figshare and PyPI (ensure tokens are set in the local environment):
-  python build/build_all.py --figshare --pypi --version 0.1.30
+Upload the latest data to Figshare (ensure tokens are set in the local environment):
+  python build/build_all.py --figshare --version 0.1.30
 """
     )
     parser.add_argument('--docker', dest='docker', default=False, action='store_true', help="Build all docker images.")
     parser.add_argument('--samples', dest='samples', default=False, action='store_true', help="Build all sample files.")
     parser.add_argument('--omics', dest='omics', default=False, action='store_true', help="Build all omics files.")
     parser.add_argument('--drugs', dest='drugs', default=False, action='store_true', help="Build all drug files.")
     parser.add_argument('--exp', dest='exp', default=False, action='store_true', help="Build all experiment files.")
-    parser.add_argument('--validate', action='store_true', help="Run schema checker on all local files. Note this will be run, whether specified or not, if figshare or pypi arguments are included.")
+    parser.add_argument('--validate', action='store_true', help="Run schema checker on all local files. Note this will be run, whether specified or not, if the figshare argument is included.")
     parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
-    parser.add_argument('--pypi', action='store_true', help="Update PYPI Package with latest Figshare data. PYPI_TOKEN must be set in local environment.")
-    parser.add_argument('--all', dest='all', default=False, action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate, figshare, or pypi commands.")
+    parser.add_argument('--all', dest='all', default=False, action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands.")
     parser.add_argument('--high_mem', dest='high_mem', default=False, action='store_true', help="If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
     parser.add_argument('--dataset', dest='datasets', default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx', help='Datasets to process. Defaults to all available.')
-    parser.add_argument('--version', type=str, required=False, help='Version number for the PyPI package and Figshare upload title (e.g., "0.1.29"). This is required for Figshare and PyPI upload. This must be a higher version than previously published versions.')
+    parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
 
@@ -131,7 +131,7 @@ def process_docker(datasets):
             datasets_to_build.extend(dataset_map.get(dataset, []))
 
         # Build the docker-compose command, adding specific datasets
-        compose_command = ['docker-compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
+        compose_command = ['docker', 'compose', '-f', compose_file, 'build', '--parallel'] + datasets_to_build
 
         log_file_path = 'local/docker.log'
         env = os.environ.copy()
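Aside on the change above: Docker Compose v2 ships as a Docker CLI plugin invoked as `docker compose` (two separate argv entries), whereas the removed form targeted the legacy standalone `docker-compose` binary. A minimal sketch of how such a command list could be executed with the log file and copied environment shown in the context lines; the actual subprocess call sits outside this hunk, so the details below are assumptions:

    import os
    import subprocess

    # Hypothetical invocation mirroring the hunk above; the compose file name is a placeholder.
    compose_command = ['docker', 'compose', '-f', 'docker-compose.yml', 'build', '--parallel']
    env = os.environ.copy()
    os.makedirs('local', exist_ok=True)
    with open('local/docker.log', 'w') as log_file:
        # Send the build output (stdout and stderr) to the log file.
        subprocess.run(compose_command, env=env, stdout=log_file,
                       stderr=subprocess.STDOUT, check=True)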
@@ -266,12 +266,14 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
         docker_run = ['docker', 'run', '--rm', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '-e', f"VERSION={version}"]
 
         # Add Appropriate Environment Variables
-        if 'PYPI_TOKEN' in env and name == 'PyPI':
-            docker_run.extend(['-e', f"PYPI_TOKEN={env['PYPI_TOKEN']}", 'upload'])
+        if name == "validate":
+            docker_run.extend(['upload'])
         if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
             docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
         if name == "validate":
             docker_run.extend(['upload'])
+        if name == "Map_Drugs" or name == "Map_Samples":
+            docker_run.extend(['upload'])
         if 'GITHUB_TOKEN' in env and name == "GitHub":
             docker_run.extend(['-e', f"GITHUB_TOKEN={env['GITHUB_TOKEN']}", 'upload'])
 
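To make the branching above concrete, this is roughly what docker_run holds after these checks for a Figshare upload. The mount path and version are placeholders, and the trailing 'upload' element appears to be the name of the Docker image to run, with cmd_arr presumably appended after it further down in the function; those last points are inferences, not shown in this hunk:

    # Hypothetical contents for name == 'Figshare', version == '0.1.30':
    docker_run = [
        'docker', 'run', '--rm',
        '-v', '/home/user/coderdata/local/all_files_dir:/tmp',  # host directory mounted at /tmp
        '-e', 'VERSION=0.1.30',
        '-e', 'FIGSHARE_TOKEN=<redacted>',
        'upload',
    ]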
@@ -302,22 +304,31 @@ def compress_file(file_path):
             with gzip.open(compressed_file_path, 'wb') as f_out:
                 shutil.copyfileobj(f_in, f_out)
         os.remove(file_path)
+
+    def get_latest_commit_hash(owner, repo, branch='main'):
+        """
+        Returns the SHA of the latest commit on the specified branch.
+        """
+        url = f"https://api.github.com/repos/{owner}/{repo}/commits/{branch}"
+        response = requests.get(url)
+        response.raise_for_status()
+
+        # The commit data is in JSON format; the 'sha' field is the full commit hash.
+        commit_data = response.json()
+        return commit_data['sha']
 
     ######
     ### Pre-Build Environment Token Check
     #####
 
     figshare_token = os.getenv('FIGSHARE_TOKEN')
-    pypi_token = os.getenv('PYPI_TOKEN')
     synapse_auth_token = os.getenv('SYNAPSE_AUTH_TOKEN')
     github_token = os.getenv('GITHUB_TOKEN')
 
 
     # Error handling for required tokens
     if args.figshare and not figshare_token:
         raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-    if args.pypi and not pypi_token:
-        raise ValueError("PYPI_TOKEN environment variable is not set.")
     if ('beataml' in args.datasets or 'mpnst' in args.datasets) and not synapse_auth_token:
         if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all:  # Token only required if building data, not upload or validate.
             raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST and beatAML datasets.")
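The get_latest_commit_hash helper added above is a thin wrapper around the GitHub REST API: GET /repos/{owner}/{repo}/commits/{branch} returns the newest commit on that branch, and its full hash sits in the 'sha' field. A minimal usage sketch, assuming the helper from the hunk above is in scope; the owner/repo values are taken from the push URL later in this diff, and this call site is not part of the change:

    # Illustrative only: unauthenticated requests are subject to GitHub's API rate limits.
    latest_sha = get_latest_commit_hash('PNNL-CompBio', 'coderdata', branch='main')
    print(latest_sha[:7])  # short form of the 40-character commit hash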
@@ -393,77 +404,74 @@ def compress_file(file_path):
     ######
     ### Begin Upload and/or validation
     #####
-
-    if args.pypi or args.figshare or args.validate:
+    if args.figshare or args.validate or github_token:
+    # if args.figshare or args.validate:
         # FigShare File Prefixes:
-        prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
-        broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
-        if "broad_sanger" in datasets:
-            prefixes.extend(broad_sanger_datasets)
-            datasets.extend(broad_sanger_datasets)
-            datasets.remove("broad_sanger")
-
 
-        figshare_token = os.getenv('FIGSHARE_TOKEN')
-        pypi_token = os.getenv('PYPI_TOKEN')
-
-        all_files_dir = 'local/all_files_dir'
-        if not os.path.exists(all_files_dir):
-            os.makedirs(all_files_dir)
-
-        # Ensure pypi tokens are available
-        if args.pypi and not pypi_token:
-            raise ValueError("Required tokens (PYPI) are not set in environment variables.")
+        # prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
+        # broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
+        # if "broad_sanger" in datasets:
+        #     prefixes.extend(broad_sanger_datasets)
+        #     datasets.extend(broad_sanger_datasets)
+        #     datasets.remove("broad_sanger")
+
+        # figshare_token = os.getenv('FIGSHARE_TOKEN')
+
+        # all_files_dir = 'local/all_files_dir'
+        # if not os.path.exists(all_files_dir):
+        #     os.makedirs(all_files_dir)
 
-        # Ensure figshare tokens are available
-        if args.figshare and not figshare_token:
-            raise ValueError("Required tokens (FIGSHARE) are not set in environment variables.")
+        # # Ensure figshare tokens are available
+        # if args.figshare and not figshare_token:
+        #     raise ValueError("Required tokens (FIGSHARE) are not set in environment variables.")
 
-        # Ensure version is specified
-        if (args.figshare or args.pypi) and not args.version:
-            raise ValueError("Version must be specified when pushing to pypi or figshare")
-
-        # Move relevant files to a designated directory
-        for file in glob(os.path.join("local", '*.*')):
-            if any(file.startswith(os.path.join("local", prefix)) for prefix in prefixes):
-                shutil.move(file, os.path.join(all_files_dir, os.path.basename(file)))
-
-        # Decompress all compressed files in the directory for schema checking
-        for file in glob(os.path.join(all_files_dir, '*.gz')):
-            decompress_file(file)
-
-        # Run schema checker - This will always run if uploading data.
-        schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
-        run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
+        # # Ensure version is specified
+        # if args.figshare and not args.version:
+        #     raise ValueError("Version must be specified when pushing to figshare")
+
+        # # Move relevant files to a designated directory
+        # for file in glob(os.path.join("local", '*.*')):
+        #     if any(file.startswith(os.path.join("local", prefix)) for prefix in prefixes):
+        #         shutil.move(file, os.path.join(all_files_dir, os.path.basename(file)))
+
+        # # Decompress all compressed files in the directory for schema checking
+        # for file in glob(os.path.join(all_files_dir, '*.gz')):
+        #     decompress_file(file)
+
+        # # Run schema checker - This will always run if uploading data.
+        # schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
+        # run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
 
-        print("Validation complete. Proceeding with file compression/decompression adjustments")
+        # print("Validation complete. Proceeding with file compression/decompression adjustments")
 
-        # Compress or decompress files based on specific conditions after checking
-        for file in glob(os.path.join(all_files_dir, '*')):
-            is_compressed = file.endswith('.gz')
-            if ('samples' in file or 'figshare' in file) and is_compressed:
-                decompress_file(file)
-            elif not ('samples' in file or 'figshare' in file) and not is_compressed:
-                compress_file(file)
-
-        print("File compression and decompression adjustments are complete.")
+        # # Compress or decompress files based on specific conditions after checking
+        # for file in glob(os.path.join(all_files_dir, '*')):
+        #     is_compressed = file.endswith('.gz')
+        #     if ('samples' in file or 'figshare' in file) and is_compressed:
+        #         decompress_file(file)
+        #     elif not ('samples' in file or 'figshare' in file) and not is_compressed:
+        #         compress_file(file)
+
+        # print("File compression and decompression adjustments are complete.")
 
-        # Upload to Figshare using Docker
-        if args.figshare and args.version and figshare_token:
-            figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
-            run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)
-
-        # Upload to PyPI using Docker
-        if args.pypi and args.version and pypi_token:
-            pypi_command = ['python3', 'scripts/push_to_pypi.py', '-y', '/tmp/figshare_latest.yml', '-d', 'coderdata/download/downloader.py', "-v", args.version]
-            run_docker_upload_cmd(pypi_command, 'all_files_dir', 'PyPI', args.version)
+        # # Upload to Figshare using Docker
+        # if args.figshare and args.version and figshare_token:
+        #     figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
+        #     run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)
+
 
     # Push changes to GitHub using Docker
-    if args.version and args.figshare and args.pypi and pypi_token and figshare_token and github_token and args.github_username and args.github_email:
+    # if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:
+    if args.version and github_token and args.github_username and args.github_email:
+
         git_command = [
             'bash', '-c', (
                 f'git config --global user.name "{args.github_username}" '
                 f'&& git config --global user.email "{args.github_email}" '
+                f'&& cp /tmp/improve_sample_mapping.json /usr/src/app/coderdata/build/improve_sample_mapping.json '
+                f'&& cp /tmp/improve_drug_mapping.json /usr/src/app/coderdata/build/improve_drug_mapping.json '
+                f'&& git add build/improve_sample_mapping.json '
+                f'&& git add build/improve_drug_mapping.json '
                 f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
                 f'&& git add docs/_data/figshare_latest.yml '
                 f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
@@ -472,8 +480,14 @@ def compress_file(file_path):
                 f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git --tags'
             )
         ]
-        run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
 
+        sample_mapping_command = ['python3', 'scripts/map_improve_sample_ids.py', '--local_dir', "/tmp", '--version', args.version]
+        run_docker_upload_cmd(sample_mapping_command, 'all_files_dir', 'Map_Samples', args.version)
 
+        drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
+        run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
+
+        run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
+
 if __name__ == '__main__':
     main()