Skip to content

Commit 8aa378d

Browse files
committed
Now will recognize tsvs (for experiments)
1 parent 087493c commit 8aa378d

1 file changed

Lines changed: 50 additions & 8 deletions

File tree

scripts/map_improve_sample_ids.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -304,65 +304,107 @@ def rewrite_samples_file(file_path, sample_id_mapping, datasets=None, dataset=No
304304

305305
#### Rewrite all other files based on Stable IDs.
306306

307+
def _get_file_extension_ignore_gz(file_path):
308+
"""
309+
Extract the underlying extension, ignoring a trailing .gz if present.
310+
For example:
311+
- "data.csv.gz" -> ".csv"
312+
- "data.tsv.gz" -> ".tsv"
313+
- "data.csv" -> ".csv"
314+
- "data.tsv" -> ".tsv"
315+
"""
316+
file_path_lower = file_path.lower()
317+
if file_path_lower.endswith('.gz'):
318+
file_path_lower = file_path_lower[:-3] # strip .gz
319+
_, ext = os.path.splitext(file_path_lower)
320+
return ext
321+
307322
def rewrite_other_file(file_path, sample_id_mapping, datasets=None, dataset=None):
    """
    Rewrites other files (e.g., transcriptomics, proteomics):
    - Replace 'improve_sample_id' with stable_id based on sample_id_mapping
    - Skips if the file is not in the specified datasets
    - Handles CSV, TSV, and gzipped versions of these files

    Parameters:
        file_path: path to the data file (.csv/.tsv, optionally .gz).
        sample_id_mapping: dict keyed by (dataset, improve_sample_id) -> stable_id.
        datasets: optional collection of dataset-name substrings; the file is
            only processed if its basename contains one of them.
        dataset: dataset name used to build the mapping key; required.
    """
    # Determine whether to process this file
    if datasets and not any(ds in os.path.basename(file_path) for ds in datasets):
        return
    if dataset is None:
        print(f"Dataset not specified for {file_path}. Skipping.")
        return

    # Figure out the underlying extension (ignoring .gz) to determine delimiter
    actual_ext = _get_file_extension_ignore_gz(file_path)
    if actual_ext == '.tsv':
        delim = '\t'
    else:
        # Default to comma for .csv or unknown extensions
        delim = ','

    print(f"Rewriting other file: {file_path}")
    # Decompress if needed (returns unzipped path plus a flag indicating if it was gzipped)
    file_path, was_gz = decompress_gz_if_needed(file_path)
    if not os.path.exists(file_path):
        # If decompressed file doesn’t exist or is empty, re-compress (if needed) and skip
        recompress_if_needed(file_path, file_path, was_gz)
        print(f"File not found or empty after decompression: {file_path}")
        return

    # Read the file ONCE; the previous version parsed it fully for validation
    # and then reopened and re-parsed it for the rewrite pass.
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        rows = list(csv.reader(f, delimiter=delim))
    if not rows:
        recompress_if_needed(file_path, file_path, was_gz)
        print(f"Empty file: {file_path}")
        return

    header = rows[0]
    if "improve_sample_id" not in header:
        recompress_if_needed(file_path, file_path, was_gz)
        print(f"'improve_sample_id' column not found in {file_path}")
        return
    # Safe: membership was checked above, so index() cannot raise ValueError.
    idx_id = header.index("improve_sample_id")

    tmp = file_path + ".tmp"
    with open(tmp, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout, delimiter=delim)
        writer.writerow(header)
        for row in rows[1:]:
            # Short rows that don't reach the ID column pass through unchanged.
            if len(row) <= idx_id:
                writer.writerow(row)
                continue

            original_id = row[idx_id].strip()
            mapping_key = (dataset, original_id)
            if mapping_key in sample_id_mapping:
                new_id = sample_id_mapping[mapping_key]
                if new_id != original_id:
                    print(f"Replacing improve_sample_id '{original_id}' with stable_id '{new_id}' in {file_path}")
                row[idx_id] = new_id

            writer.writerow(row)

    # Replace original file with updated file
    os.replace(tmp, file_path)
    # Recompress if needed
    recompress_if_needed(file_path, file_path, was_gz)
363400

401+
402+
403+
364404
#### Call everything in Main
365405

406+
407+
366408
def main():
367409
parser = argparse.ArgumentParser(description="""
368410
Use quadruplet overlaps to assign stable IDs across builds.

0 commit comments

Comments
 (0)