updated generate_slides to find partial datasets

hschryver · hschryver · commit f9f67aac197f · 2026-01-16T03:57:04.000Z
diff --git a/src/xenium_analysis_tools/process_xenium/generate_dataset_slides.py b/src/xenium_analysis_tools/process_xenium/generate_dataset_slides.py
@@ -4,15 +4,15 @@
 import gc 
 import pandas as pd
 import numpy as np
-from shutil import copytree, rmtree
 
 from xenium_analysis_tools.utils.io_utils import (
     atomic_write_sdata, 
     is_complete, 
     is_complete_store, 
     load_config, 
     setup_logging,
-    get_sections_df
+    get_sections_df,
+    get_partial_dataset
 )
 from xenium_analysis_tools.process_xenium.process_spatialdata import read_xenium_slide
 
@@ -37,39 +37,6 @@ def find_xenium_bundle(bundle_name, data_folder='/root/capsule/data'):
                 path_to_bundle = found_dirs[0]
                 break
     return path_to_bundle
-
-def get_data_folder_slides(dataset_name, source_path, dest_path):
-    """Copy slide data from source to destination, handling incomplete files."""
-    # Find slides
-    slides = list(source_path.glob('slide_*.zarr'))
-    if not slides:
-        print(f"No slides found in {source_path}")
-        return
-    
-    # Create destination directory
-    dest_path.mkdir(parents=True, exist_ok=True)
-    
-    # Copy slides
-    for slide in slides:
-        print(f"Checking {slide.name}...")
-        dest_slide = dest_path / slide.name
-        
-        # Skip if destination already complete
-        if dest_slide.exists() and is_complete_store(dest_slide):
-            print(f"{slide.name} already complete")
-            continue
-        
-        # Only copy if source is valid
-        if not is_complete_store(slide):
-            print(f"{slide.name} source incomplete, skipping")
-            continue
-        
-        # Remove incomplete destination and copy
-        if dest_slide.exists():
-            rmtree(dest_slide)
-            
-        copytree(slide, dest_slide)
-        print(f"Copied {slide.name}")
     
 def generate_slides(dataset_name: str, config_path: str=None, select_sections: list[int]|None = None):
     """
@@ -99,8 +66,8 @@ def generate_slides(dataset_name: str, config_path: str=None, select_sections: l
     if processing_config['check_data_folder_slides']:
         logger.info("Checking and copying slides from data folder if exist...")
         data_folder_slides_path = Path(paths['data_root']) / f'{dataset_name}{processing_config["save_initial_dataset_suffix"]}'
-        get_data_folder_slides(dataset_name, data_folder_slides_path, save_sections_path)
-    
+        get_partial_dataset(data_folder_slides_path, save_sections_path, pattern='slide_*', subset_ids=select_sections)
+
     # Get the slides information
     sections_df = get_sections_df(raw_data_folder)
 
diff --git a/src/xenium_analysis_tools/utils/io_utils.py b/src/xenium_analysis_tools/utils/io_utils.py
@@ -5,6 +5,7 @@
 import logging
 import sys
 import pandas as pd
+from shutil import copytree, rmtree
 
 def load_config(config_path=None):
     if config_path is not None:
@@ -140,4 +141,48 @@ def safe_copy_tree(src: Path, dst: Path):
             return
         shutil.rmtree(dst)
     
-    shutil.copytree(src, dst)
+    shutil.copytree(src, dst)
+
+def get_partial_dataset(source_path, dest_path, pattern='section_*', subset_ids=None):
+    """Copy complete stores matching ``pattern`` from source_path into dest_path.
+
+    Args:
+        source_path: Path object providing ``glob``; searched for ``pattern``.
+        dest_path: Path object providing ``mkdir``; receives the copies.
+        pattern: Glob pattern selecting candidate entries in source_path.
+        subset_ids: Optional collection of integer IDs. When given, an entry
+            is kept only if one of the ``_``-separated numeric tokens after
+            the first token of its stem is in the collection.
+
+    Entries already complete at the destination, or incomplete at the source
+    (per ``is_complete_store``), are skipped; an incomplete destination copy
+    is removed and copied again. Returns None.
+    """
+    matches = list(source_path.glob(pattern))
+    if subset_ids is not None:
+        wanted = set(subset_ids)
+        # Ignore non-numeric name tokens (e.g. suffixes) instead of letting
+        # int() raise ValueError and abort the whole copy.
+        matches = [
+            m for m in matches
+            if any(t.isdecimal() and int(t) in wanted for t in m.stem.split('_')[1:])
+        ]
+    if not matches:
+        print(f"No matches found in {source_path}")
+        return
+    dest_path.mkdir(parents=True, exist_ok=True)
+    for ma in matches:
+        print(f"Checking {ma.name}...")
+        dest_slide = dest_path / ma.name
+        if dest_slide.exists() and is_complete_store(dest_slide):
+            print(f"{ma.name} already complete")
+            continue
+        # Only copy from a source that is itself complete.
+        if not is_complete_store(ma):
+            print(f"{ma.name} source incomplete, skipping")
+            continue
+        # copytree refuses to write into an existing directory by default.
+        if dest_slide.exists():
+            rmtree(dest_slide)
+        copytree(ma, dest_slide)
+        print(f"Copied {ma.name}")