Skip to content

Commit d2b26af

Browse files
committed
updated processing sections functions
1 parent f9f67aa commit d2b26af

3 files changed

Lines changed: 65 additions & 56 deletions

File tree

src/xenium_analysis_tools/process_xenium/generate_dataset_slides.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,28 +15,6 @@
1515
get_partial_dataset
1616
)
1717
from xenium_analysis_tools.process_xenium.process_spatialdata import read_xenium_slide
18-
19-
def find_xenium_bundle(bundle_name, data_folder='/root/capsule/data'):
20-
data_folder = Path(data_folder)
21-
search_paths = [
22-
data_folder / 'xenium_data',
23-
data_folder / 'Xenium_output_pilot'
24-
]
25-
search_paths = [path for path in search_paths if path.exists()]
26-
all_dirs = np.concatenate([list(folder.iterdir()) for folder in search_paths])
27-
output_folders = np.concatenate([list(folder.glob('output-*')) for folder in search_paths])
28-
subfolders = np.setdiff1d(all_dirs, output_folders)
29-
path_to_bundle = None
30-
found_dirs = [dir for dir in output_folders if dir.name == bundle_name]
31-
if found_dirs:
32-
path_to_bundle = found_dirs[0]
33-
else:
34-
for sub in subfolders:
35-
found_dirs = [dir for dir in list(sub.iterdir()) if dir.name == bundle_name]
36-
if found_dirs:
37-
path_to_bundle = found_dirs[0]
38-
break
39-
return path_to_bundle
4018

4119
def generate_slides(dataset_name: str, config_path: str=None, select_sections: list[int]|None = None):
4220
"""

src/xenium_analysis_tools/process_xenium/process_dataset_slides.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
1+
from pathlib import Path
2+
import tqdm
3+
import pandas as pd
4+
import numpy as np
5+
import spatialdata as sd
6+
17
from xenium_analysis_tools.utils.io_utils import (
28
atomic_write_sdata,
39
is_complete,
410
is_complete_store,
511
load_config,
612
setup_logging,
7-
get_sections_df
13+
get_sections_df,
14+
get_partial_dataset,
15+
find_xenium_bundle
816
)
917
from xenium_analysis_tools.process_xenium.process_spatialdata import (
1018
process_metadata,
@@ -18,11 +26,6 @@
1826
from xenium_analysis_tools.process_xenium.validate_sections import (
1927
plot_section_bboxes
2028
)
21-
from pathlib import Path
22-
import tqdm
23-
import pandas as pd
24-
import numpy as np
25-
import spatialdata as sd
2629

2730
def process_slides(dataset_name: str, config_path: str=None, select_sections: list[int]|None = None, slides_parent_folder='data'):
2831
"""
@@ -31,63 +34,69 @@ def process_slides(dataset_name: str, config_path: str=None, select_sections: li
3134

3235
# ---- Set up ----
3336
config = load_config(config_path)
37+
38+
# Paths/directories
3439
paths = config['paths']
3540
processing_config = config['processing_control']
3641
raw_data_folder = Path(paths['data_root']) / dataset_name
3742
slide_sd_path = Path(paths[f'{slides_parent_folder}_root']) / f"{dataset_name}{processing_config['save_initial_dataset_suffix']}"
3843
save_sections_parent_folder = processing_config['save_processed_data_parent_folder']
3944
save_sections_path = Path(paths[f'{save_sections_parent_folder}_root']) / f"{dataset_name}{processing_config['save_processed_dataset_suffix']}"
4045
save_sections_path.mkdir(parents=True, exist_ok=True)
46+
47+
# Logger
4148
logger, log_file_path = setup_logging(save_sections_path)
49+
50+
# Print out where sections are being saved
4251
logger.info(f"Dataset Name: {dataset_name}")
4352
logger.info(f"Configuration loaded from {config_path}")
4453
logger.info(f"Raw data folder: {raw_data_folder}")
4554
logger.info(f"Slides are being loaded from: {slide_sd_path}")
4655
logger.info(f"Processed sections will be saved to: {save_sections_path}")
56+
57+
# If specified, copy sections from data folder instead of re-generating
58+
if processing_config['check_data_folder_slides']:
59+
logger.info("Checking and copying slides from data folder if exist...")
60+
data_folder_slides_path = Path(paths['data_root']) / f'{dataset_name}{processing_config["save_processed_dataset_suffix"]}'
61+
get_partial_dataset(data_folder_slides_path, save_sections_path, pattern='slide_*', subset_ids=select_sections)
62+
63+
# Get the slides information
4764
sections_df = get_sections_df(raw_data_folder)
65+
4866
# Limit sections, if specified
4967
if select_sections is not None:
5068
logger.info(f"Limiting processing to sections: {select_sections}")
5169
sections_df = sections_df[sections_df['section'].isin(select_sections)]
70+
71+
# Set up processing loop
5272
logger.info(f"Total slides found: {len(sections_df)}")
5373
unique_slides = sections_df.groupby('slide_id')
5474

55-
# ---- (Optional) Move data from a data asset to results folder ----
56-
# Load partially processed sections from data asset and save to results to further process
57-
if config['processing_control'].get('load_processed_from_asset', False):
58-
logger.info(f"Looking for processed sections in a data asset...")
59-
dataset_asset_folder = Path(config['paths']['data_root']) / f"{dataset_name}{config['processing_control']['save_processed_dataset_suffix']}"
60-
if dataset_asset_folder.exists():
61-
logger.info(f"Loading processed sections from data asset at {dataset_asset_folder}")
62-
sections_in_folder = list(dataset_asset_folder.glob('section_*.zarr'))
63-
for section_zarr_path in tqdm.tqdm(sections_in_folder, desc="Moving processed sections from asset"):
64-
section_save_path = save_sections_path / section_zarr_path.name
65-
if section_save_path.exists() and is_complete_store(section_save_path):
66-
logger.info(f"Section {section_zarr_path.name} already exists in results folder. Skipping.")
67-
continue
68-
logger.info(f"Loading section {section_zarr_path.stem} from asset and saving to results folder...")
69-
try:
70-
sdata = sd.read_zarr(section_zarr_path)
71-
atomic_write_sdata(sdata, section_save_path, overwrite=True)
72-
del sdata
73-
except Exception as e:
74-
logger.error(f"Error loading section {section_zarr_path.name} from asset: {e}")
75-
continue
76-
else:
77-
logger.info(f"No processed data asset found at {dataset_asset_folder}. Continuing without loading from asset.")
78-
7975
# ---- Run processing ----
80-
logger.info(f"Starting processing for dataset: {dataset_name}")
81-
unique_slides = sections_df.groupby('slide_id')
82-
for slide_id in tqdm.tqdm(unique_slides.groups.keys(), desc="Processing slides", unit="slide", total=len(unique_slides.groups.keys())):
76+
for slide_id in tqdm.tqdm(unique_slides.groups.keys(),
77+
desc="Processing slides",
78+
unit="slide",
79+
total=len(unique_slides.groups.keys())):
80+
81+
# Get slide information
8382
group = unique_slides.get_group(slide_id)
8483
slide_row = group.iloc[0]
8584
raw_slide_path = raw_data_folder / slide_row['dir']
85+
86+
# Make sure Xenium bundle is valid, or find alternative location
87+
if not (raw_slide_path / 'experiment.xenium').exists():
88+
xenium_bundle_path = find_xenium_bundle(raw_slide_path.name, data_folder=paths['data_root'])
89+
if xenium_bundle_path is not None:
90+
logger.info(f"experiment.xenium not found in {raw_slide_path}. Using found bundle at {xenium_bundle_path}")
91+
raw_slide_path = xenium_bundle_path
92+
8693
slide_sdata_path = slide_sd_path / f"{processing_config['save_initial_dataset_prefix']}{slide_id}.zarr"
8794
if not slide_sdata_path.exists():
8895
logger.warning(f"Slide data not found for slide {slide_id} at {slide_sdata_path}! Skipping.")
8996
continue
9097
slide_sections = slide_row['slide_sections']
98+
99+
# Check which sections need processing
91100
process_sections = []
92101
for section in slide_sections:
93102
section_zarr = f"{processing_config['save_processed_dataset_prefix']}{section}.zarr"
@@ -99,10 +108,10 @@ def process_slides(dataset_name: str, config_path: str=None, select_sections: li
99108
if not process_sections:
100109
logger.info(f"All sections for slide {slide_id} are already processed. Skipping slide.")
101110
continue
102-
logger.info(f"Processing slide {slide_id} sections: {[str(s) for s in process_sections]}")
103111

104112
# --- Process slide SpatialData ---
105113
# Load slide SpatialData
114+
logger.info(f"Processing slide {slide_id} sections: {[str(s) for s in process_sections]}")
106115
slide_sdata = sd.read_zarr(slide_sdata_path)
107116

108117
# Get additional metadata from raw data xenium bundle

src/xenium_analysis_tools/utils/io_utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,28 @@ def safe_copy_tree(src: Path, dst: Path):
143143

144144
shutil.copytree(src, dst)
145145

146+
def find_xenium_bundle(bundle_name, data_folder='/root/capsule/data'):
    """Locate a Xenium output bundle directory by name.

    Looks in the known data sub-folders (``xenium_data`` and
    ``Xenium_output_pilot``) for a directory whose name matches
    *bundle_name*. Top-level ``output-*`` directories are checked first;
    any remaining sub-directories are then searched one level deep.

    Parameters
    ----------
    bundle_name : str
        Name of the Xenium bundle directory to find.
    data_folder : str or Path, optional
        Root folder containing the search sub-folders.

    Returns
    -------
    Path or None
        Path to the first matching bundle, or ``None`` if not found
        (including when no search folder exists at all).
    """
    data_folder = Path(data_folder)
    # Only keep search roots that actually exist; an empty list is fine
    # (previously np.concatenate([]) crashed with ValueError here).
    search_paths = [
        path for path in (data_folder / 'xenium_data',
                          data_folder / 'Xenium_output_pilot')
        if path.exists()
    ]

    # Direct bundles: top-level 'output-*' directories in each search root.
    output_folders = [
        candidate
        for folder in search_paths
        for candidate in folder.glob('output-*')
    ]
    for candidate in output_folders:
        if candidate.name == bundle_name:
            return candidate

    # Fall back: search one level down inside the remaining sub-directories.
    output_set = set(output_folders)
    for folder in search_paths:
        for sub in folder.iterdir():
            # Skip the already-checked output-* entries and plain files
            # (iterdir() on a file would raise NotADirectoryError).
            if sub in output_set or not sub.is_dir():
                continue
            for entry in sub.iterdir():
                if entry.name == bundle_name:
                    return entry
    return None
167+
146168
def get_partial_dataset(source_path, dest_path, pattern='section_*', subset_ids=None):
147169
"""Copy slide data from source to destination, handling incomplete files."""
148170
# Find matches

0 commit comments

Comments
 (0)