
Commit 7156d25

Ready to merge
1 parent 65168ae commit 7156d25

5 files changed: 115 additions & 66 deletions

build/broad_sanger/05b_separate_datasets.py

Lines changed: 45 additions & 6 deletions
@@ -1,7 +1,8 @@
 import gc
 import polars as pl
-
-
+import os
+import gzip
+import shutil

 def main():
     datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
@@ -23,58 +24,96 @@ def main():
     }

     for dataset in datasets_to_process:
-        exp = pl.read_csv("broad_sanger_experiments.tsv", separator="\t") # Keeping memory down, so I will not be making copies.
+        exp_in_filename = "broad_sanger_experiments.tsv"
+        if os.path.isfile(exp_in_filename + ".gz"):
+            exp_in_filename = exp_in_filename + ".gz"
+
+        exp = pl.read_csv(exp_in_filename, separator="\t") # Keeping memory down, so I will not be making copies.
         exp = exp.filter(pl.col("study") == dataset)

         # Extract information to separate out datasets
         exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list()
         exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list()

         # Write Filtered Experiments File to TSV. Then delete it from memory.
-        exp_filename = f"/tmp/{dataset}_experiments.tsv".lower()
-        exp.write_csv(exp_filename, separator="\t")
+        exp_filename_out = f"/tmp/{dataset}_experiments.tsv".lower()
+        exp.write_csv(exp_filename_out, separator="\t")
+        # Rewrite as gzipped if needed
+        if exp_in_filename.endswith(".gz"):
+            with open(exp_filename_out, 'rb') as f_in, gzip.open(exp_filename_out + ".gz", 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+            os.remove(exp_filename_out)
+
         del exp
         gc.collect()


         # Filter Samples files, write to file, delete from mem.
         for samples in samples_datatypes:
             samples_filename_in = f"broad_sanger_{samples}.csv"
+            if os.path.isfile(samples_filename_in + ".gz"):
+                samples_filename_in += ".gz"
+
             samples_filename_out = f"/tmp/{dataset}_{samples}.csv".lower()
             samples_df = pl.read_csv(samples_filename_in)
             samples_df = samples_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
             samples_df.write_csv(samples_filename_out)  # csv
+
+            # Rewrite as gzipped if needed
+            if samples_filename_in.endswith(".gz"):
+                with open(samples_filename_out, 'rb') as f_in, gzip.open(samples_filename_out + ".gz", 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+                os.remove(samples_filename_out)
+
             del samples_df
             gc.collect()

         # One by one, filter other Omics files, write to file, delete from mem.
         for omics in omics_datatypes:
             omics_filename_in = f"broad_sanger_{omics}.csv"
+            if os.path.isfile(omics_filename_in + ".gz"):
+                omics_filename_in += ".gz"
+
             omics_filename_out = f"/tmp/{dataset}_{omics}.csv".lower()
             omics_df = pl.read_csv(omics_filename_in)
             omics_df = omics_df.filter(pl.col("improve_sample_id").is_in(exp_improve_sample_ids))
             omics_df = omics_df.filter(pl.col("source").is_in(dataset_sources[dataset]))
             omics_df.write_csv(omics_filename_out)  # csv
+
+            # Rewrite as gzipped if needed
+            if omics_filename_in.endswith(".gz"):
+                with open(omics_filename_out, 'rb') as f_in, gzip.open(omics_filename_out + ".gz", 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+                os.remove(omics_filename_out)
+
             del omics_df
             gc.collect()


         # One by one, filter other Drugs files, write to file, delete from mem.
         for drugs in drugs_datatypes:
             drugs_filename_in = f"broad_sanger_{drugs}.tsv"
+            if os.path.isfile(drugs_filename_in + ".gz"):
+                drugs_filename_in += ".gz"
+
             drugs_filename_out = f"/tmp/{dataset}_{drugs}.tsv".lower()
             if drugs == "drug_descriptors":
                 drugs_df = pl.read_csv(drugs_filename_in, separator="\t",
                                        dtypes={"improve_drug_id": pl.Utf8,
                                                "structural_descriptor": pl.Utf8,
                                                "descriptor_value": pl.Utf8}
                                        )
-
             else:
                 drugs_df = pl.read_csv(drugs_filename_in, separator="\t")

             drugs_df = drugs_df.filter(pl.col("improve_drug_id").is_in(exp_improve_drug_ids))
             drugs_df.write_csv(drugs_filename_out, separator="\t")  # tsv
+
+            if drugs_filename_in.endswith(".gz"):
+                with open(drugs_filename_out, 'rb') as f_in, gzip.open(drugs_filename_out + ".gz", 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+                os.remove(drugs_filename_out)
+
             del drugs_df
             gc.collect()
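
Note: the gzip round trip (write the plain file, re-compress it, delete the intermediate) is repeated for the experiments, samples, omics, and drugs outputs. A small helper along these lines could factor it out; this is a sketch only, and gzip_in_place is a hypothetical name rather than part of this commit:

import gzip
import os
import shutil

def gzip_in_place(path):
    """Compress path to path + '.gz' and remove the uncompressed original."""
    gz_path = path + ".gz"
    with open(path, "rb") as f_in, gzip.open(gz_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)  # stream the bytes without loading the file into memory
    os.remove(path)
    return gz_path

Each loop body would then reduce to a single call, e.g. gzip_in_place(samples_filename_out), guarded by the same endswith(".gz") check on the input filename.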

build/build_all.py

Lines changed: 45 additions & 31 deletions
@@ -402,8 +402,8 @@ def get_latest_commit_hash(owner, repo, branch='main'):
     ######
     ### Begin Upload and/or validation
     #####
-    # if args.figshare or args.validate or github_token:
-    if args.figshare or args.validate:
+    if args.figshare or args.validate or github_token:
+    # if args.figshare or args.validate:
         # FigShare File Prefixes:

         prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'genes', 'drugs']
@@ -436,6 +436,13 @@ def get_latest_commit_hash(owner, repo, branch='main'):
         for file in glob(os.path.join(all_files_dir, '*.gz')):
             decompress_file(file)

+        ### These should be done before schema checking.
+        sample_mapping_command = ['python3', 'scripts/map_improve_sample_ids.py', '--local_dir', "/tmp", '--version', args.version]
+        run_docker_upload_cmd(sample_mapping_command, 'all_files_dir', 'Map_Samples', args.version)
+
+        drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
+        run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
+
         # Run schema checker - This will always run if uploading data.
         schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
         run_docker_upload_cmd(schema_check_command, 'all_files_dir', 'validate', args.version)
@@ -452,40 +459,47 @@ def get_latest_commit_hash(owner, repo, branch='main'):

         print("File compression and decompression adjustments are complete.")

-        # Upload to Figshare using Docker
+        ### Upload to Figshare using Docker
         if args.figshare and args.version and figshare_token:
-            figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--publish']
+            figshare_command = ['python3', 'scripts/push_to_figshare.py', '--directory', "/tmp", '--title', f"CODERData{args.version}", '--token', os.getenv('FIGSHARE_TOKEN'), '--project_id', '189342', '--version', args.version, '--publish']
             run_docker_upload_cmd(figshare_command, 'all_files_dir', 'Figshare', args.version)

+        ### Push changes to GitHub using Docker
+        # if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:

-        # Push changes to GitHub using Docker
-        # if args.version and args.figshare and figshare_token and github_token and args.github_username and args.github_email:
-        if args.version and github_token and args.github_username and args.github_email:
-
-            git_command = [
-                'bash', '-c', (
-                    f'git config --global user.name "{args.github_username}" '
-                    f'&& git config --global user.email "{args.github_email}" '
-                    f'&& cp /tmp/improve_sample_mapping.json /usr/src/app/coderdata/build/improve_sample_mapping.json '
-                    f'&& cp /tmp/improve_drug_mapping.json /usr/src/app/coderdata/build/improve_drug_mapping.json '
-                    f'&& git add build/improve_sample_mapping.json '
-                    f'&& git add build/improve_drug_mapping.json '
-                    f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
-                    f'&& git add docs/_data/figshare_latest.yml '
-                    f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
-                    f'&& git tag {args.version} '
-                    f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git main '
-                    f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git --tags'
-                )
-            ]
+        # You can only upload to Github after Figshare upload is completed - otherwise figshare_latest.yml and dataset.yml won't be available.
+        if args.version and github_token and args.github_username and args.github_email:
+
+            git_command = [
+                'bash', '-c', (
+                    f'git config --global user.name "{args.github_username}" '
+                    f'&& git config --global user.email "{args.github_email}" '
+
+                    # Checkout a new branch
+                    f'&& git checkout -b testing-auto-build-pr-{args.version} '
+
+                    # Copy and add the necessary files
+                    f'&& cp /tmp/improve_sample_mapping.json.gz /usr/src/app/coderdata/build/improve_sample_mapping.json.gz '
+                    f'&& cp /tmp/improve_drug_mapping.json.gz /usr/src/app/coderdata/build/improve_drug_mapping.json.gz '
+                    f'&& gunzip /usr/src/app/coderdata/build/*.gz '
+                    f'&& git add -f build/improve_sample_mapping.json build/improve_drug_mapping.json '
+                    f'&& cp /tmp/figshare_latest.yml /usr/src/app/coderdata/docs/_data/figshare_latest.yml '
+                    f'&& cp /tmp/dataset.yml /usr/src/app/coderdata/coderdata/dataset.yml '
+                    f'&& git add -f docs/_data/figshare_latest.yml coderdata/dataset.yml'
+
+                    # Tag and push
+                    f'&& git commit -m "Data Built and Uploaded. New Tag: {args.version}" '
+                    f'&& git tag {args.version} '
+                    f'&& git push https://{args.github_username}:{github_token}@github.com/PNNL-CompBio/coderdata.git testing-auto-build-pr-{args.version} '
+
+                    # Create a PR using GitHub CLI
+                    f'&& gh pr create --title "Testing Auto PR instead of auto Merge {args.version}" '
+                    f'--body "This PR was automatically generated by the build process." '
+                    f'--base main --head testing-auto-build-pr-{args.version}'
+                )
+            ]

-            sample_mapping_command = ['python3', 'scripts/map_improve_sample_ids.py', '--local_dir', "/tmp", '--version', args.version]
-            run_docker_upload_cmd(sample_mapping_command, 'all_files_dir', 'Map_Samples', args.version)
-
-            drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
-            run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
-
-            run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)
+            run_docker_upload_cmd(git_command, 'all_files_dir', 'GitHub', args.version)

 if __name__ == '__main__':
     main()
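
Note: the git/gh step copies /tmp/figshare_latest.yml and /tmp/dataset.yml into the repo, and both files are produced by the Figshare upload (push_to_figshare.py), so the GitHub push is ordered after the upload. A pre-flight check along these lines could make that dependency explicit; this is a sketch under that assumption, not code from the commit:

import os
import sys

# Artifacts the git/gh step expects to find in /tmp (written by earlier build steps).
required_artifacts = [
    "/tmp/figshare_latest.yml",
    "/tmp/dataset.yml",
    "/tmp/improve_sample_mapping.json.gz",
    "/tmp/improve_drug_mapping.json.gz",
]

missing = [p for p in required_artifacts if not os.path.isfile(p)]
if missing:
    sys.exit(f"Skipping GitHub push; missing artifacts: {', '.join(missing)}")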

build/build_dataset.py

Lines changed: 2 additions & 2 deletions
@@ -55,7 +55,7 @@ def process_docker(dataset,validate):

     datasets_to_build.extend(dataset_map.get(dataset, []))

-    compose_command = ['docker-compose', '-f', compose_file, 'build'] + datasets_to_build
+    compose_command = ['docker', 'compose', '-f', compose_file, 'build'] + datasets_to_build

     log_file_path = 'local/docker.log'
     env = os.environ.copy()
@@ -258,7 +258,7 @@ def run_schema_checker(dataset):
         decompress_file(os.path.join('local', all_files_dir, file))

     # Run schema checker
-    schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
+    schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
     run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')

 def main():
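
Note: the standalone docker-compose binary (Compose v1) is replaced here with the docker compose plugin syntax. If older hosts still need the legacy binary, a fallback roughly like this is one option; build_compose_base is a hypothetical helper, not part of the commit:

import shutil
import subprocess

def build_compose_base():
    """Return the Compose invocation available on this host, preferring the v2 plugin."""
    try:
        # 'docker compose version' succeeds only when the Compose plugin is installed.
        subprocess.run(["docker", "compose", "version"], check=True, capture_output=True)
        return ["docker", "compose"]
    except (FileNotFoundError, subprocess.CalledProcessError):
        if shutil.which("docker-compose"):
            return ["docker-compose"]
        raise RuntimeError("Neither 'docker compose' nor 'docker-compose' is available")

compose_command would then be assembled as build_compose_base() + ['-f', compose_file, 'build'] + datasets_to_build.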

build/docker/Dockerfile.upload

Lines changed: 1 addition & 13 deletions
@@ -22,16 +22,4 @@ RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | \


 RUN git clone https://github.com/PNNL-CompBio/coderdata.git
-WORKDIR /usr/src/app/coderdata
-RUN git checkout sample_id_mapping_update
-
-# COPY ./schema /usr/src/app/schema
-# ADD scripts/check_schema.py ./
-
-# #Add id mapping script
-# ADD scripts/map_improve_sample_ids.py ./
-# ADD scripts/map_improve_drug_ids.py ./
-
-# #Add improve_mapping.json file. If file is not present this won't fail.
-# ADD build/* ./
-
+WORKDIR /usr/src/app/coderdata

scripts/push_to_figshare.py

Lines changed: 22 additions & 14 deletions
@@ -7,7 +7,7 @@
 import yaml


-def upload_to_figshare(token, title, directory, project_id, publish, article_id=None):
+def upload_to_figshare(token, title, directory, project_id, publish, version, article_id=None):
     """
     Uploads a file to Figshare and publishes the article.

@@ -187,34 +187,41 @@ def delete_existing_file(article_id, file_id):
         issue_request('DELETE', f'account/articles/{article_id}/files/{file_id}')


-    def write_figshare_details_to_yaml(article_id, project_id, title):
+    def write_figshare_details_to_yaml(article_id, project_id, title, version):
         """
-        Write details of Figshare to yaml
+        Write details of Figshare to two yaml files.
+        figshare_latest.yml will be updated for the docs.
+        dataset.yml will be updated for the python package.
         """
-        #convert slashes and periods to underscores so the file links are generated correctly.
+
+        # update dataset.yml
+        with open("coderdata/dataset.yml", "r") as f:
+            data = yaml.safe_load(f)
+        data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}"
+        data["version"] = version
+        with open("tmp/dataset.yml", "w") as f:
+            yaml.safe_dump(data, f, sort_keys=False)
+
+
+        # write figshare_latest.yml
         title_updated = title.replace('/', '_')
         title_updated = title_updated.replace('.', '_')
         article_info = issue_request('GET', f'articles/{article_id}')
-        # article_link = f"https://figshare.com/articles/dataset/{title}/{project_id}/file/{article_id}"
         article_link = f"https://figshare.com/articles/dataset/{title_updated}/{article_id}"

         # Retrieve the article details
         article_details_response = requests.get(article_info['url'])
         article_details_response.raise_for_status()
         article_details = article_details_response.json()
-
-        # Construct the URLs
-        file_url_links = {file['name']:f"https://figshare.com/articles/dataset/{title_updated}/{article_id}?file={file['id']}" for file in article_details['files']}
-        file_download_link = {file['name']: file['download_url'] for file in article_details['files']}
+
         yaml_data = {
             'article_link': article_link,
-            'file_url': file_url_links,
-            'file_download': file_download_link
+            'version': version,
         }

         with open('/tmp/figshare_latest.yml', 'w') as file:
             yaml.dump(yaml_data, file, default_flow_style=False)
-
+
     article_id = create_or_get_article(title, project_id, article_id)
     all_files_uploaded = True
@@ -249,7 +256,7 @@ def write_figshare_details_to_yaml(article_id, project_id, title):
         print("Files uploaded successfully but not published.")

     if all_files_uploaded:
-        write_figshare_details_to_yaml(article_id, project_id, title)
+        write_figshare_details_to_yaml(article_id, project_id, title, version)

 def main():
     parser = argparse.ArgumentParser(description='Upload files to Figshare.')
@@ -259,9 +266,10 @@ def main():
     parser.add_argument('-p', '--publish', help='Publish the article', action='store_true')
     parser.add_argument('-j', '--project_id', help='Existing Figshare project ID', required=True)
     parser.add_argument('-a', '--article_id', help='Existing Figshare article ID', required=False, default=None)
+    parser.add_argument('-v', '--version', help='Latest Version', required=True)
     args = parser.parse_args()

-    upload_to_figshare(args.token, args.title, args.directory, args.project_id, args.publish, args.article_id)
+    upload_to_figshare(args.token, args.title, args.directory, args.project_id, args.publish, args.version, args.article_id)

 if __name__ == "__main__":
     main()
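
Note: --version is now a required argument, and the version string is written into both dataset.yml and figshare_latest.yml. With the updated interface the script would be invoked roughly as below; the version value and token are illustrative placeholders, mirroring the figshare_command assembled in build_all.py:

import subprocess

subprocess.run([
    "python3", "scripts/push_to_figshare.py",
    "--directory", "/tmp",
    "--title", "CODERDatav0.1.41",   # f"CODERData{args.version}" in build_all.py; version string is illustrative
    "--token", "<FIGSHARE_TOKEN>",   # placeholder; the real token comes from the environment
    "--project_id", "189342",
    "--version", "v0.1.41",          # illustrative version string
    "--publish",
], check=True)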
