@@ -304,65 +304,107 @@ def rewrite_samples_file(file_path, sample_id_mapping, datasets=None, dataset=No
304304
305305#### Rewrite all other files based on Stable IDs.
306306
307+ def _get_file_extension_ignore_gz (file_path ):
308+ """
309+ Extract the underlying extension, ignoring a trailing .gz if present.
310+ For example:
311+ - "data.csv.gz" -> ".csv"
312+ - "data.tsv.gz" -> ".tsv"
313+ - "data.csv" -> ".csv"
314+ - "data.tsv" -> ".tsv"
315+ """
316+ file_path_lower = file_path .lower ()
317+ if file_path_lower .endswith ('.gz' ):
318+ file_path_lower = file_path_lower [:- 3 ] # strip .gz
319+ _ , ext = os .path .splitext (file_path_lower )
320+ return ext
321+
def rewrite_other_file(file_path, sample_id_mapping, datasets=None, dataset=None):
    """
    Rewrite an omics-style data file (e.g., transcriptomics, proteomics) in place:
    - Replaces values in the 'improve_sample_id' column with stable IDs
      looked up from sample_id_mapping, keyed by (dataset, original_id).
    - Skips the file if it is not part of the specified datasets, or if
      no dataset name was supplied.
    - Handles CSV, TSV, and gzipped (.gz) versions of these files.

    Parameters:
        file_path: Path to the file to rewrite (may end in .gz).
        sample_id_mapping: Dict mapping (dataset, improve_sample_id) -> stable_id.
        datasets: Optional collection of dataset names; the file is processed
            only when one of them appears in its basename.
        dataset: Name of the dataset this file belongs to (required).
    """
    # Determine whether to process this file.
    if datasets and not any(ds in os.path.basename(file_path) for ds in datasets):
        return
    if dataset is None:
        print(f"Dataset not specified for {file_path}. Skipping.")
        return

    # Pick the delimiter from the underlying extension (ignoring .gz);
    # default to comma for .csv or unknown extensions.
    delim = '\t' if _get_file_extension_ignore_gz(file_path) == '.tsv' else ','

    print(f"Rewriting other file: {file_path}")
    # Decompress if needed (returns unzipped path plus a flag indicating if it was gzipped).
    file_path, was_gz = decompress_gz_if_needed(file_path)
    if not os.path.exists(file_path):
        # Decompressed file doesn't exist; re-compress (if needed) and skip.
        recompress_if_needed(file_path, file_path, was_gz)
        print(f"File not found or empty after decompression: {file_path}")
        return

    # Inspect only the header row -- no need to materialize the whole file
    # in memory just to check for the ID column.
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        header = next(csv.reader(f, delimiter=delim), None)
    if header is None:
        recompress_if_needed(file_path, file_path, was_gz)
        print(f"Empty file: {file_path}")
        return
    if "improve_sample_id" not in header:
        recompress_if_needed(file_path, file_path, was_gz)
        print(f"'improve_sample_id' column not found in {file_path}")
        return
    # Membership was checked above, so index() cannot raise here.
    idx_id = header.index("improve_sample_id")

    tmp = file_path + ".tmp"
    with open(file_path, 'r', newline='', encoding='utf-8') as fin, \
         open(tmp, 'w', newline='', encoding='utf-8') as fout:
        reader = csv.reader(fin, delimiter=delim)
        writer = csv.writer(fout, delimiter=delim)
        # Header row passes through unchanged.
        writer.writerow(next(reader))
        for row in reader:
            # Short rows cannot contain the ID column; copy them verbatim.
            if len(row) <= idx_id:
                writer.writerow(row)
                continue

            original_id = row[idx_id].strip()
            mapping_key = (dataset, original_id)
            if mapping_key in sample_id_mapping:
                new_id = sample_id_mapping[mapping_key]
                if new_id != original_id:
                    print(f"Replacing improve_sample_id '{original_id}' with stable_id '{new_id}' in {file_path}")
                    row[idx_id] = new_id

            writer.writerow(row)

    # Replace original file with updated file, then re-compress if needed.
    os.replace(tmp, file_path)
    recompress_if_needed(file_path, file_path, was_gz)
363400
401+
402+
403+
364404#### Call everything in Main
365405
406+
407+
366408def main ():
367409 parser = argparse .ArgumentParser (description = """
368410Use quadruplet overlaps to assign stable IDs across builds.
0 commit comments