Skip to content

Commit 4b4e258

Browse files
committed
Added the /align_drug_descriptors.py file and changes required to build_all.py
1 parent 004b730 commit 4b4e258

2 files changed

Lines changed: 102 additions & 1 deletion

File tree

build/build_all.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
273273
docker_run.extend(['upload'])
274274
if 'FIGSHARE_TOKEN' in env and name == 'Figshare':
275275
docker_run.extend(['-e', f"FIGSHARE_TOKEN={env['FIGSHARE_TOKEN']}", 'upload'])
276-
if name == "Map_Drugs" or name == "Map_Samples":
276+
if name in ["Map_Drugs", "Map_Samples", "Align_Drug_Descriptors"]:
277277
docker_run.extend(['upload'])
278278
if 'GITHUB_TOKEN' in env and name == "GitHub":
279279
docker_run.extend(['-e', f"GITHUB_TOKEN={env['GITHUB_TOKEN']}", 'upload'])
@@ -445,6 +445,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
445445

446446
drug_mapping_command = ['python3', 'scripts/map_improve_drug_ids.py', '--local_dir', "/tmp", '--version', args.version]
447447
run_docker_upload_cmd(drug_mapping_command, 'all_files_dir', 'Map_Drugs', args.version)
448+
449+
drug_mapping_command_2 = ['python3', 'scripts/align_drug_descriptors.py', '--local_dir', "/tmp", '--version', args.version]
450+
run_docker_upload_cmd(drug_mapping_command_2, 'all_files_dir', 'Align_Drug_Descriptors', args.version)
448451

449452
# Run schema checker - This will always run if uploading data.
450453
schema_check_command = ['python3', 'scripts/check_schema.py', '--datasets'] + datasets

scripts/align_drug_descriptors.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import gzip
4+
import shutil
5+
import csv
6+
import argparse
7+
8+
# Helper functions
9+
def decompress_gz_if_needed(path):
    """Return a plain (uncompressed) path for ``path``, decompressing if needed.

    Args:
        path: Path to a file. Only a trailing ``.gz`` marks it as gzipped.

    Returns:
        A ``(plain_path, was_gz)`` tuple. For a non-``.gz`` input this is
        ``(path, False)`` and nothing is written. For a ``.gz`` input the
        content is decompressed into a newly created, uniquely named file in
        the same directory and ``(that_file, True)`` is returned; the caller
        is responsible for removing it.

    Raises:
        OSError: If the archive cannot be read or the output cannot be
            written. The partially written output is removed first.
    """
    if not path.endswith('.gz'):
        return path, False

    # Local import so this function does not depend on the file's import block.
    import tempfile

    # Decompressing straight onto ``path[:-3]`` would overwrite (and, after
    # the caller's cleanup, delete) a sibling plain file with the same stem,
    # so a unique file name is used instead.
    stem = os.path.basename(path[:-3])
    directory = os.path.dirname(os.path.abspath(path))
    fd, out = tempfile.mkstemp(prefix=stem + '.', suffix='.tmp', dir=directory)
    try:
        # Wrap the descriptor first so it is closed even if the archive
        # cannot be opened.
        with os.fdopen(fd, 'wb') as f_out, gzip.open(path, 'rb') as f_in:
            shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # Do not leave a half-written file behind; re-raise unchanged.
        os.remove(out)
        raise
    return out, True
17+
18+
def recompress_if_needed(decompressed, was_gz, original):
    """Gzip ``decompressed`` back over ``original`` and delete ``decompressed``.

    Does nothing when ``was_gz`` is false.

    Args:
        decompressed: Path of the plain file to compress.
        was_gz: Whether the file came from a gzip archive; if false the call
            is a no-op.
        original: Path of the ``.gz`` archive to (re)create.

    Raises:
        OSError: If reading ``decompressed`` or writing the archive fails. In
            that case ``original`` is left as it was.
    """
    if not was_gz:
        return

    # Compress into a sibling file and swap it in with os.replace, so a
    # failure part-way through never leaves a truncated archive at
    # ``original``.
    partial = original + '.partial'
    try:
        with open(decompressed, 'rb') as f_in, gzip.open(partial, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial, original)
    finally:
        # Only present if something failed before the replace.
        if os.path.exists(partial):
            os.remove(partial)
    os.remove(decompressed)
24+
25+
def find_descriptor_files(directory):
    """Return the sorted paths of descriptor tables directly inside ``directory``.

    An entry qualifies when its name ends with ``_drug_descriptors.tsv`` or
    ``_drug_descriptors.tsv.gz``. Each returned path is ``directory`` joined
    with the entry name.
    """
    wanted_suffixes = ('_drug_descriptors.tsv', '_drug_descriptors.tsv.gz')
    matches = [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(wanted_suffixes)
    ]
    matches.sort()
    return matches
32+
33+
34+
# Actual work
35+
def build_reference_map(files):
    """Collect the first-seen value for every (drug id, descriptor) pair.

    Files are read in the order given, so the earliest file containing a pair
    decides its reference value. Paths ending in ``.gz`` are streamed through
    gzip; nothing is written to disk and the inputs are left untouched.

    Args:
        files: Iterable of paths to tab-separated tables with the columns
            ``improve_drug_id``, ``structural_descriptor`` and
            ``descriptor_value``.

    Returns:
        Dict mapping ``(improve_drug_id, structural_descriptor)`` to the
        first ``descriptor_value`` encountered (as read, i.e. a string).

    Raises:
        KeyError: If a row lacks one of the required columns.
    """
    ref = {}
    for fp in files:
        # This is a read-only pass, so read archives in text mode directly
        # instead of decompressing to disk and recompressing over the input.
        opener = gzip.open if fp.endswith('.gz') else open
        with opener(fp, 'rt', newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f, delimiter='\t'):
                key = (row['improve_drug_id'], row['structural_descriptor'])
                # setdefault keeps the first value seen for the pair.
                ref.setdefault(key, row['descriptor_value'])
    return ref
49+
50+
def rewrite_files(files, ref):
    """Rewrite, in place, every row whose value disagrees with ``ref``.

    Each file is streamed into a sibling ``<file>.tmp``; the temporary file
    replaces the original only if at least one row was corrected, otherwise
    the original is left untouched. Paths ending in ``.gz`` are read and
    written through gzip. A line is printed for every corrected row.

    Args:
        files: Iterable of paths to tab-separated tables with the columns
            ``improve_drug_id``, ``structural_descriptor`` and
            ``descriptor_value``.
        ref: Dict mapping ``(improve_drug_id, structural_descriptor)`` to the
            value every file should carry. Pairs missing from ``ref`` are
            copied unchanged.

    Raises:
        KeyError: If a row lacks one of the required columns. The temporary
            file is removed and the original is not modified.
    """
    for fp in files:
        opener = gzip.open if fp.endswith('.gz') else open
        tmp = fp + '.tmp'
        changed = False
        try:
            with opener(fp, 'rt', newline='', encoding='utf-8') as fin, \
                    opener(tmp, 'wt', newline='', encoding='utf-8') as fout:
                reader = csv.DictReader(fin, delimiter='\t')
                if reader.fieldnames is None:
                    # Empty file: there is no header to copy and nothing to fix.
                    continue
                # The csv default terminator is '\r\n'; use '\n' so a corrected
                # file does not also have all of its line endings changed.
                writer = csv.DictWriter(
                    fout,
                    fieldnames=reader.fieldnames,
                    delimiter='\t',
                    lineterminator='\n',
                )
                writer.writeheader()

                for row in reader:
                    key = (row['improve_drug_id'], row['structural_descriptor'])
                    correct = ref.get(key)
                    if correct is not None and row['descriptor_value'] != correct:
                        print(f"Fixing {key} in {os.path.basename(fp)}: "
                              f"{row['descriptor_value']} to {correct}")
                        row['descriptor_value'] = correct
                        changed = True
                    writer.writerow(row)

            if changed:
                # Swap in the corrected copy only when something differed.
                os.replace(tmp, fp)
        finally:
            # Still present when nothing changed, the file was empty, or an
            # error occurred; never leave it behind.
            if os.path.exists(tmp):
                os.remove(tmp)
80+
81+
def main():
    """Command-line entry point.

    Parses ``--local_dir`` (default ``.``), looks up the descriptor files in
    it, builds the reference map from them and rewrites any mismatching rows.
    Prints a notice and returns early when no descriptor files are found.
    """
    cli = argparse.ArgumentParser(
        description="Harmonize drug_descriptor values across multiple files."
    )
    cli.add_argument(
        '--local_dir',
        default='.',
        help='Folder containing *_drug_descriptors.tsv[.gz]',
    )
    # --version is accepted but never read; it is hidden from --help.
    cli.add_argument('--version', help=argparse.SUPPRESS)  # ignore the version input
    options = cli.parse_args()

    descriptor_files = find_descriptor_files(options.local_dir)
    if descriptor_files:
        reference = build_reference_map(descriptor_files)
        rewrite_files(descriptor_files, reference)
        print("Done.")
    else:
        print("No drug_descriptor files found in", options.local_dir)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)