Skip to content

Commit eaf4aab

Browse files
committed
Added a check for partially processed datasets in the data folder
1 parent 83ff1aa commit eaf4aab

1 file changed

Lines changed: 58 additions & 0 deletions

File tree

src/xenium_analysis_tools/process_xenium/generate_dataset_slides.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import gc
55
import pandas as pd
66
import numpy as np
7+
from shutil import copytree, rmtree
78

89
from xenium_analysis_tools.utils.io_utils import (
910
atomic_write_sdata,
@@ -36,29 +37,79 @@ def find_xenium_bundle(bundle_name, data_folder='/root/capsule/data'):
3637
path_to_bundle = found_dirs[0]
3738
break
3839
return path_to_bundle
40+
41+
def get_data_folder_slides(dataset_name, source_path, dest_path):
    """Mirror completed slide zarr stores from *source_path* into *dest_path*.

    Stores that already exist and are complete at the destination are left
    untouched; incomplete sources are never copied; an incomplete destination
    is deleted and replaced with a fresh copy of its complete source.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset. NOTE(review): currently unused inside the
        function — kept for interface stability; confirm whether callers
        still need it.
    source_path : pathlib.Path
        Folder scanned for ``slide_*.zarr`` stores.
    dest_path : pathlib.Path
        Folder the stores are copied into (created if missing).
    """
    stores = list(source_path.glob('slide_*.zarr'))
    if not stores:
        print(f"No slides found in {source_path}")
        return

    dest_path.mkdir(parents=True, exist_ok=True)

    for store in stores:
        print(f"Checking {store.name}...")
        target = dest_path / store.name

        # Nothing to do when the destination copy is already complete.
        if target.exists() and is_complete_store(target):
            print(f"{store.name} already complete")
            continue

        # Never propagate a half-written source store.
        if not is_complete_store(store):
            print(f"{store.name} source incomplete, skipping")
            continue

        # Replace any partial destination with a fresh, full copy.
        if target.exists():
            rmtree(target)
        copytree(store, target)
        print(f"Copied {store.name}")
3973

4074
def generate_slides(dataset_name: str, config_path: str=None, select_sections: list[int]|None = None):
4175
"""
4276
Generate slide-level SpatialData objects from raw Xenium data bundles.
4377
"""
4478
# ---- Set up ----
4579
config = load_config(config_path)
80+
81+
# Paths/directories
4682
paths = config['paths']
4783
processing_config = config['processing_control']
4884
raw_data_folder = Path(paths['data_root']) / dataset_name
4985
save_sections_parent_folder = processing_config['save_initial_data_parent_folder']
5086
save_sections_path = Path(paths[f'{save_sections_parent_folder}_root']) / f"{dataset_name}{processing_config['save_initial_dataset_suffix']}"
5187
save_sections_path.mkdir(parents=True, exist_ok=True)
88+
89+
# Logger
5290
logger, log_file_path = setup_logging(save_sections_path)
91+
92+
# Print out where slides are being saved
5393
logger.info(f"Dataset Name: {dataset_name}")
5494
logger.info(f"Configuration loaded from {config_path}")
5595
logger.info(f"Raw data folder: {raw_data_folder}")
5696
logger.info(f"Slides will be saved to: {save_sections_path}")
97+
98+
# If specified, copy slides from data folder instead of re-generating
99+
if processing_config['check_data_folder_slides']:
100+
logger.info("Checking and copying slides from data folder if exist...")
101+
data_folder_slides_path = Path(paths['data_root']) / f'{dataset_name}{processing_config["save_initial_dataset_suffix"]}'
102+
get_data_folder_slides(dataset_name, data_folder_slides_path, save_sections_path)
103+
104+
# Get the slides information
57105
sections_df = get_sections_df(raw_data_folder)
106+
58107
# Limit sections, if specified
59108
if select_sections is not None:
60109
logger.info(f"Limiting processing to sections: {select_sections}")
61110
sections_df = sections_df[sections_df['section'].isin(select_sections)]
111+
112+
# Set up processing loop
62113
logger.info(f"Processing {len(sections_df)} sections from {sections_df['slide_id'].nunique()} slide(s)")
63114
unique_slides = sections_df.groupby('slide_id')
64115

@@ -68,15 +119,20 @@ def generate_slides(dataset_name: str, config_path: str=None, select_sections: l
68119
unit="slide",
69120
total=len(unique_slides.groups.keys())):
70121
try:
122+
# Get slide information
71123
group = unique_slides.get_group(slide_id)
72124
slide_row = group.iloc[0]
73125
raw_slide_path = raw_data_folder / slide_row['dir']
74126
save_slide_path = save_sections_path / f"{processing_config['save_initial_dataset_prefix']}{slide_id}.zarr"
75127
logger.info(f"Processing slide {slide_id}...")
128+
129+
# Check if already generated
76130
if is_complete(save_slide_path, check_store=True):
77131
logger.info(f"Slide {slide_id} already processed. Skipping.")
78132
continue
79133
logger.info(f"Generating SpatialData object for slide {slide_id}...")
134+
135+
# Make sure experiment file exists - if not, try to find alternative location
80136
if not (raw_slide_path / 'experiment.xenium').exists():
81137
logger.info(f"Experiment file not found for slide {slide_id} at {raw_slide_path / 'experiment.xenium'}")
82138
logger.info(f"Looking for alternative experiment file...")
@@ -87,6 +143,8 @@ def generate_slides(dataset_name: str, config_path: str=None, select_sections: l
87143
else:
88144
logger.error(f"Could not find experiment file for slide {slide_id}. Skipping.")
89145
continue
146+
147+
# Read Xenium slide and save
90148
logger.info(f"Reading Xenium bundle: {raw_slide_path}")
91149
sdata_reader_params = config.get('sdata_reader_params', {})
92150
if sdata_reader_params.get('n_jobs') == "max": sdata_reader_params['n_jobs'] = os.cpu_count()

0 commit comments

Comments
 (0)