44import gc
55import pandas as pd
66import numpy as np
7+ from shutil import copytree , rmtree
78
89from xenium_analysis_tools .utils .io_utils import (
910 atomic_write_sdata ,
@@ -36,29 +37,79 @@ def find_xenium_bundle(bundle_name, data_folder='/root/capsule/data'):
3637 path_to_bundle = found_dirs [0 ]
3738 break
3839 return path_to_bundle
40+
def get_data_folder_slides(dataset_name, source_path, dest_path):
    """Sync slide zarr stores from source_path into dest_path.

    Scans source_path for 'slide_*.zarr' stores and copies each one into
    dest_path. A store is skipped when the destination copy already exists
    and passes is_complete_store, or when the source itself fails that
    check (half-written stores are never propagated). An incomplete
    destination copy is deleted before being re-copied from source.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset (currently unused here; kept for interface
        consistency with callers).
    source_path : pathlib.Path
        Directory to scan for 'slide_*.zarr' stores.
    dest_path : pathlib.Path
        Directory to copy stores into; created (with parents) if missing.
    """
    # Gather candidate zarr stores from the source directory.
    zarr_stores = list(source_path.glob('slide_*.zarr'))
    if not zarr_stores:
        print(f"No slides found in {source_path}")
        return

    # Ensure the destination directory hierarchy exists before copying.
    dest_path.mkdir(parents=True, exist_ok=True)

    for store in zarr_stores:
        print(f"Checking {store.name}...")
        target = dest_path / store.name

        # Nothing to do when a validated copy is already in place.
        if target.exists() and is_complete_store(target):
            print(f"{store.name} already complete")
            continue

        # Never propagate a half-written source store.
        if not is_complete_store(store):
            print(f"{store.name} source incomplete, skipping")
            continue

        # A stale/partial destination must go before copytree can recreate it.
        if target.exists():
            rmtree(target)

        copytree(store, target)
        print(f"Copied {store.name}")
3973
4074def generate_slides (dataset_name : str , config_path : str = None , select_sections : list [int ]| None = None ):
4175 """
4276 Generate slide-level SpatialData objects from raw Xenium data bundles.
4377 """
4478 # ---- Set up ----
4579 config = load_config (config_path )
80+
81+ # Paths/directories
4682 paths = config ['paths' ]
4783 processing_config = config ['processing_control' ]
4884 raw_data_folder = Path (paths ['data_root' ]) / dataset_name
4985 save_sections_parent_folder = processing_config ['save_initial_data_parent_folder' ]
5086 save_sections_path = Path (paths [f'{ save_sections_parent_folder } _root' ]) / f"{ dataset_name } { processing_config ['save_initial_dataset_suffix' ]} "
5187 save_sections_path .mkdir (parents = True , exist_ok = True )
88+
89+ # Logger
5290 logger , log_file_path = setup_logging (save_sections_path )
91+
92+ # Print out where slides are being saved
5393 logger .info (f"Dataset Name: { dataset_name } " )
5494 logger .info (f"Configuration loaded from { config_path } " )
5595 logger .info (f"Raw data folder: { raw_data_folder } " )
5696 logger .info (f"Slides will be saved to: { save_sections_path } " )
97+
98+ # If specified, copy slides from data folder instead of re-generating
99+ if processing_config ['check_data_folder_slides' ]:
100+ logger .info ("Checking and copying slides from data folder if exist..." )
101+ data_folder_slides_path = Path (paths ['data_root' ]) / f'{ dataset_name } { processing_config ["save_initial_dataset_suffix" ]} '
102+ get_data_folder_slides (dataset_name , data_folder_slides_path , save_sections_path )
103+
104+ # Get the slides information
57105 sections_df = get_sections_df (raw_data_folder )
106+
58107 # Limit sections, if specified
59108 if select_sections is not None :
60109 logger .info (f"Limiting processing to sections: { select_sections } " )
61110 sections_df = sections_df [sections_df ['section' ].isin (select_sections )]
111+
112+ # Set up processing loop
62113 logger .info (f"Processing { len (sections_df )} sections from { sections_df ['slide_id' ].nunique ()} slide(s)" )
63114 unique_slides = sections_df .groupby ('slide_id' )
64115
@@ -68,15 +119,20 @@ def generate_slides(dataset_name: str, config_path: str=None, select_sections: l
68119 unit = "slide" ,
69120 total = len (unique_slides .groups .keys ())):
70121 try :
122+ # Get slide information
71123 group = unique_slides .get_group (slide_id )
72124 slide_row = group .iloc [0 ]
73125 raw_slide_path = raw_data_folder / slide_row ['dir' ]
74126 save_slide_path = save_sections_path / f"{ processing_config ['save_initial_dataset_prefix' ]} { slide_id } .zarr"
75127 logger .info (f"Processing slide { slide_id } ..." )
128+
129+ # Check if already generated
76130 if is_complete (save_slide_path , check_store = True ):
77131 logger .info (f"Slide { slide_id } already processed. Skipping." )
78132 continue
79133 logger .info (f"Generating SpatialData object for slide { slide_id } ..." )
134+
135+ # Make sure experiment file exists - if not, try to find alternative location
80136 if not (raw_slide_path / 'experiment.xenium' ).exists ():
81137 logger .info (f"Experiment file not found for slide { slide_id } at { raw_slide_path / 'experiment.xenium' } " )
82138 logger .info (f"Looking for alternative experiment file..." )
@@ -87,6 +143,8 @@ def generate_slides(dataset_name: str, config_path: str=None, select_sections: l
87143 else :
88144 logger .error (f"Could not find experiment file for slide { slide_id } . Skipping." )
89145 continue
146+
147+ # Read Xenium slide and save
90148 logger .info (f"Reading Xenium bundle: { raw_slide_path } " )
91149 sdata_reader_params = config .get ('sdata_reader_params' , {})
92150 if sdata_reader_params .get ('n_jobs' ) == "max" : sdata_reader_params ['n_jobs' ] = os .cpu_count ()
0 commit comments