1+ from pathlib import Path
2+ import tqdm
3+ import pandas as pd
4+ import numpy as np
5+ import spatialdata as sd
6+
17from xenium_analysis_tools .utils .io_utils import (
28 atomic_write_sdata ,
39 is_complete ,
410 is_complete_store ,
511 load_config ,
612 setup_logging ,
7- get_sections_df
13+ get_sections_df ,
14+ get_partial_dataset ,
15+ find_xenium_bundle
816)
917from xenium_analysis_tools .process_xenium .process_spatialdata import (
1018 process_metadata ,
1826from xenium_analysis_tools .process_xenium .validate_sections import (
1927 plot_section_bboxes
2028)
21- from pathlib import Path
22- import tqdm
23- import pandas as pd
24- import numpy as np
25- import spatialdata as sd
2629
2730def process_slides (dataset_name : str , config_path : str = None , select_sections : list [int ]| None = None , slides_parent_folder = 'data' ):
2831 """
@@ -31,63 +34,69 @@ def process_slides(dataset_name: str, config_path: str=None, select_sections: li
3134
3235 # ---- Set up ----
3336 config = load_config (config_path )
37+
38+ # Paths/directories
3439 paths = config ['paths' ]
3540 processing_config = config ['processing_control' ]
3641 raw_data_folder = Path (paths ['data_root' ]) / dataset_name
3742 slide_sd_path = Path (paths [f'{ slides_parent_folder } _root' ]) / f"{ dataset_name } { processing_config ['save_initial_dataset_suffix' ]} "
3843 save_sections_parent_folder = processing_config ['save_processed_data_parent_folder' ]
3944 save_sections_path = Path (paths [f'{ save_sections_parent_folder } _root' ]) / f"{ dataset_name } { processing_config ['save_processed_dataset_suffix' ]} "
4045 save_sections_path .mkdir (parents = True , exist_ok = True )
46+
47+ # Logger
4148 logger , log_file_path = setup_logging (save_sections_path )
49+
50+ # Print out where sections are being saved
4251 logger .info (f"Dataset Name: { dataset_name } " )
4352 logger .info (f"Configuration loaded from { config_path } " )
4453 logger .info (f"Raw data folder: { raw_data_folder } " )
4554 logger .info (f"Slides are being loaded from: { slide_sd_path } " )
4655 logger .info (f"Processed sections will be saved to: { save_sections_path } " )
56+
57+ # If specified, copy sections from data folder instead of re-generating
58+ if processing_config ['check_data_folder_slides' ]:
59+ logger .info ("Checking and copying slides from data folder if exist..." )
60+ data_folder_slides_path = Path (paths ['data_root' ]) / f'{ dataset_name } { processing_config ["save_processed_dataset_suffix" ]} '
61+ get_partial_dataset (data_folder_slides_path , save_sections_path , pattern = 'slide_*' , subset_ids = select_sections )
62+
63+ # Get the slides information
4764 sections_df = get_sections_df (raw_data_folder )
65+
4866 # Limit sections, if specified
4967 if select_sections is not None :
5068 logger .info (f"Limiting processing to sections: { select_sections } " )
5169 sections_df = sections_df [sections_df ['section' ].isin (select_sections )]
70+
71+ # Set up processing loop
5272 logger .info (f"Total slides found: { len (sections_df )} " )
5373 unique_slides = sections_df .groupby ('slide_id' )
5474
55- # ---- (Optional) Move data from a data asset to results folder ----
56- # Load partially processed sections from data asset and save to results to further process
57- if config ['processing_control' ].get ('load_processed_from_asset' , False ):
58- logger .info (f"Looking for processed sections in a data asset..." )
59- dataset_asset_folder = Path (config ['paths' ]['data_root' ]) / f"{ dataset_name } { config ['processing_control' ]['save_processed_dataset_suffix' ]} "
60- if dataset_asset_folder .exists ():
61- logger .info (f"Loading processed sections from data asset at { dataset_asset_folder } " )
62- sections_in_folder = list (dataset_asset_folder .glob ('section_*.zarr' ))
63- for section_zarr_path in tqdm .tqdm (sections_in_folder , desc = "Moving processed sections from asset" ):
64- section_save_path = save_sections_path / section_zarr_path .name
65- if section_save_path .exists () and is_complete_store (section_save_path ):
66- logger .info (f"Section { section_zarr_path .name } already exists in results folder. Skipping." )
67- continue
68- logger .info (f"Loading section { section_zarr_path .stem } from asset and saving to results folder..." )
69- try :
70- sdata = sd .read_zarr (section_zarr_path )
71- atomic_write_sdata (sdata , section_save_path , overwrite = True )
72- del sdata
73- except Exception as e :
74- logger .error (f"Error loading section { section_zarr_path .name } from asset: { e } " )
75- continue
76- else :
77- logger .info (f"No processed data asset found at { dataset_asset_folder } . Continuing without loading from asset." )
78-
7975 # ---- Run processing ----
80- logger .info (f"Starting processing for dataset: { dataset_name } " )
81- unique_slides = sections_df .groupby ('slide_id' )
82- for slide_id in tqdm .tqdm (unique_slides .groups .keys (), desc = "Processing slides" , unit = "slide" , total = len (unique_slides .groups .keys ())):
76+ for slide_id in tqdm .tqdm (unique_slides .groups .keys (),
77+ desc = "Processing slides" ,
78+ unit = "slide" ,
79+ total = len (unique_slides .groups .keys ())):
80+
81+ # Get slide information
8382 group = unique_slides .get_group (slide_id )
8483 slide_row = group .iloc [0 ]
8584 raw_slide_path = raw_data_folder / slide_row ['dir' ]
85+
86+ # Make sure Xenium bundle is valid, or find alternative location
87+ if not (raw_slide_path / 'experiment.xenium' ).exists ():
88+ xenium_bundle_path = find_xenium_bundle (raw_slide_path .name , data_folder = paths ['data_root' ])
89+ if xenium_bundle_path is not None :
90+ logger .info (f"experiment.xenium not found in { raw_slide_path } . Using found bundle at { xenium_bundle_path } " )
91+ raw_slide_path = xenium_bundle_path
92+
8693 slide_sdata_path = slide_sd_path / f"{ processing_config ['save_initial_dataset_prefix' ]} { slide_id } .zarr"
8794 if not slide_sdata_path .exists ():
8895 logger .warning (f"Slide data not found for slide { slide_id } at { slide_sdata_path } ! Skipping." )
8996 continue
9097 slide_sections = slide_row ['slide_sections' ]
98+
99+ # Check which sections need processing
91100 process_sections = []
92101 for section in slide_sections :
93102 section_zarr = f"{ processing_config ['save_processed_dataset_prefix' ]} { section } .zarr"
@@ -99,10 +108,10 @@ def process_slides(dataset_name: str, config_path: str=None, select_sections: li
99108 if not process_sections :
100109 logger .info (f"All sections for slide { slide_id } are already processed. Skipping slide." )
101110 continue
102- logger .info (f"Processing slide { slide_id } sections: { [str (s ) for s in process_sections ]} " )
103111
104112 # --- Process slide SpatialData ---
105113 # Load slide SpatialData
114+ logger .info (f"Processing slide { slide_id } sections: { [str (s ) for s in process_sections ]} " )
106115 slide_sdata = sd .read_zarr (slide_sdata_path )
107116
108117 # Get additional metadata from raw data xenium bundle
0 commit comments