Skip to content

Commit 54227fc

Browse files
committed
Update for NCI60, bug fixes, and HCMI GDC tool
1 parent d0dc5ed commit 54227fc

4 files changed

Lines changed: 57 additions & 42 deletions

File tree

build/broad_sanger/04b-nci60-updated.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,11 @@ def main():
107107

108108
finaldf = pl.DataFrame(
109109
{
110-
'source':['NCI60' for a in molar['improve_drug_id']], ##2024 build
110+
'source':['NCI60_24' for a in molar['improve_drug_id']], ##2024 build
111111
'improve_sample_id':molar['improve_sample_id'],
112112
'Drug':molar['improve_drug_id'],
113-
'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
113+
# 'study': molar['EXPID'],#['NCI60' for a in nonulls['improve_drug_id']],
114+
'study': "NCI60",
114115
'time':molar['time'],
115116
'time_unit':molar['time_unit'],
116117
'DOSE': [(10**a)*1000000 for a in molar['CONCENTRATION']], ##move from molar to uM to match pharmacoDB

build/build_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def run_docker_validate_cmd(cmd_arr, all_files_dir, name):
218218
Wrapper for 'docker run' command used during validation and uploads.
219219
'''
220220
env = os.environ.copy()
221-
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp"]
221+
docker_run = ['docker', 'run', '-v', f"{env['PWD']}/local/{all_files_dir}:/tmp", '--platform=linux/amd64']
222222
docker_run.extend(['upload'])
223223
docker_run.extend(cmd_arr)
224224
print('Executing:', ' '.join(docker_run))
@@ -258,7 +258,7 @@ def run_schema_checker(dataset):
258258
decompress_file(os.path.join('local', all_files_dir, file))
259259

260260
# Run schema checker
261-
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets
261+
schema_check_command = ['python3', 'check_schema.py', '--datasets'] + datasets
262262
run_docker_validate_cmd(schema_check_command, all_files_dir, 'Validation')
263263

264264
def main():

build/hcmi/02-getHCMIData.py

Lines changed: 51 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,79 +17,93 @@
1717
import polars as pl
1818
import gc
1919
import hashlib
20+
from pathlib import Path
2021

2122
def download_tool(url):
    """
    Download, extract, and prepare the GDC client tool.

    The downloaded archive is unpacked inside a private scratch directory
    created in the current working directory. Only that scratch directory
    is searched for nested archives and for the executable, so unrelated
    ``.zip`` files in the working directory are never extracted or deleted
    and an unrelated ``gdc-client`` elsewhere in the tree is never picked
    up. The executable is then moved into the current working directory.

    Parameters
    ----------
    url : str
        The URL to download the tool from.

    Returns
    -------
    str
        The path to the `gdc-client` executable.

    Raises
    ------
    FileNotFoundError
        If no `gdc-client` executable is present in the extracted archive.
    """
    # Imported locally so this function does not depend on a file-level import.
    import tempfile

    # Download the file
    print("Downloading tool...")
    filename = wget.download(url)

    try:
        with tempfile.TemporaryDirectory(dir=os.curdir, prefix="gdc-client-extract-") as scratch:
            # First extraction
            print(f"\nExtracting {filename}...")
            shutil.unpack_archive(filename, scratch)

            # The distribution may wrap the real archive in an outer zip.
            _unpack_nested_archives(scratch)

            extracted = _find_gdc_client(scratch)
            if not extracted:
                raise FileNotFoundError("`gdc-client` executable not found after extraction.")

            # The scratch directory lives in the working directory, so this
            # is a same-filesystem rename; os.replace overwrites any older
            # copy of the executable.
            gdc_client_path = os.path.join(os.curdir, os.path.basename(extracted))
            os.replace(extracted, gdc_client_path)
    finally:
        # Remove the downloaded archive whether or not extraction succeeded.
        if os.path.exists(filename):
            os.remove(filename)

    # Ensure `gdc-client` is executable
    print(f"Making {gdc_client_path} executable...")
    st = os.stat(gdc_client_path)
    os.chmod(gdc_client_path, st.st_mode | stat.S_IEXEC)

    return gdc_client_path


def _unpack_nested_archives(directory, max_passes=5):
    """Unpack every ``.zip`` found under *directory* in place, then delete it.

    Repeats until no archives remain or *max_passes* is reached, which
    bounds the work done on a pathologically nested download.
    """
    for _ in range(max_passes):
        nested = [
            os.path.join(root, entry)
            for root, _dirs, files in os.walk(directory)
            for entry in files
            if entry.endswith(".zip")
        ]
        if not nested:
            return
        for archive in nested:
            print(f"Extracting nested archive: {os.path.basename(archive)}...")
            shutil.unpack_archive(archive, os.path.dirname(archive))
            os.remove(archive)


def _find_gdc_client(directory):
    """Return the path of the GDC client executable under *directory*, or None."""
    # NOTE(review): the Windows bundle presumably ships `gdc-client.exe`;
    # confirm against the actual archive contents.
    candidates = ("gdc-client", "gdc-client.exe")
    for root, _dirs, files in os.walk(directory):
        for candidate in candidates:
            if candidate in files:
                return os.path.join(root, candidate)
    return None
5567

5668
def is_tool(name):
    """
    Check if a specific tool is available on the system.

    The tool counts as available when `shutil.which` finds it either on
    ``PATH`` or in the current working directory. Using `shutil.which` for
    the working-directory lookup means a directory, or a file without
    execute permission, that merely shares the tool's name is not reported
    as an installed tool.

    Parameters
    ----------
    name : str
        The name of the tool.

    Returns
    -------
    bool
        True if the tool is found, otherwise False.
    """
    if shutil.which(name) is not None:
        return True
    # Restrict the search path to the working directory for the local copy.
    return shutil.which(name, path=os.curdir) is not None
7283

7384
def ensure_gdc_client():
    """
    Make sure the GDC client tool can be found, fetching it when missing.

    When `is_tool` reports that `gdc-client` is already available, only a
    message is printed. Otherwise the download URL matching
    `platform.system()` is passed to `download_tool`.

    Raises
    ------
    ValueError
        If there is no download URL for the current operating system.
    """
    tool_name = "gdc-client"

    # Guard clause: nothing to do when the client is already present.
    if is_tool(tool_name):
        print("`gdc-client` is already installed.")
        return

    print("GDC client not found. Downloading...")
    download_urls = {
        "Darwin": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_OSX_x64-py3.8-macos-14.zip",
        "Windows": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_Windows_x64-py3.8-windows-2019.zip",
        "Linux": "https://gdc.cancer.gov/system/files/public/file/gdc-client_2.3_Ubuntu_x64-py3.8-ubuntu-20.04.zip",
    }

    os_type = platform.system()
    if os_type not in download_urls:
        raise ValueError(f"Unsupported OS: {os_type}")

    gdc_client_path = download_tool(download_urls[os_type])
    print(f"`gdc-client` downloaded and available at {gdc_client_path}")
106+
93107

94108
def extract_uuids_from_manifest(manifest_data):
95109
"""

schema/expected_files.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ datasets:
3939
- target_class: Drug
4040
file: /tmp/mpnst_drugs.tsv
4141

42-
mpnst:
42+
mpnstpdx:
4343
- target_class: Sample
4444
file: /tmp/mpnstpdx_samples.csv
4545
- target_class: Transcriptomics

0 commit comments

Comments
 (0)