computational-cell-analytics · constantinpape · Jun 12, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -24,10 +24,10 @@ Required in the environment: `scikit-build-core`, `cmake>=3.21`, `ninja`, `nanob
 
 Before introducing union-finds, priority queues, edge hashing, stride math, threading helpers, or label relabeling, check `include/bioimage_cpp/detail/`:
 
-- `detail/union_find.hxx` — `UnionFind` (path compression + union by rank). `find`, `merge`, `merge_to`, `unite_roots`.
+- `util/union_find.hxx` — `UnionFind` (namespace `bioimage_cpp::util`, not `detail`; path compression + union by rank). `find`, `merge`, `merge_to`, `unite_roots`.
 - `detail/indexed_heap.hxx` — addressable max-heap with mutable priorities. `DenseIndexedHeap` (vector-backed locator, integer keys in `[0, N)`) and `SparseIndexedHeap` (hashmap-backed, arbitrary keys).
 - `detail/edge_hash.hxx` — `Edge`, `edge_key`, `EdgeHash` for hashing unordered node pairs.
-- `detail/grid.hxx` — `c_order_strides`, `valid_offset_target`, `is_valid_grid_edge` for row-major grid offsets.
+- `detail/grid.hxx` — `number_of_elements`, `c_order_strides`, `valid_offset_target`, `is_valid_grid_edge` for row-major grid shapes/offsets.
 - `detail/relabel.hxx` — `dense_relabel` to map arbitrary labels onto `[0, k)` preserving first-occurrence order.
 - `detail/threading.hxx` — `normalize_thread_count`, `parallel_for_chunks(n_threads, n_items, chunk)`.
 

diff --git a/development/graph/benchmark_accumulate_labels.py b/development/graph/benchmark_accumulate_labels.py
@@ -0,0 +1,82 @@
+"""Benchmark RAG label accumulation (``accumulate_labels``).
+
+The code review replaced the per-thread ``n_threads x n_nodes`` map-of-maps with
+a single combined-key ``(node, other)`` histogram per thread, and made the
+per-node argmax single-threaded. This script times ``accumulate_labels`` across
+thread counts (to confirm the allocation win and rule out a high-thread-count
+regression from the now-sequential argmax) and across ``other``-label
+cardinalities.
+
+Inputs: a deterministic block-wise over-segmentation (each node is a contiguous
+block, as in a real over-segmentation), so the RAG adjacency is local.
+
+Not part of the test suite. Run::
+
+    python development/graph/benchmark_accumulate_labels.py --repeats 11
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def _make_labels(shape=(40, 256, 256), coarsen=(2, 4, 4)) -> np.ndarray:
+    coarse_shape = tuple(s // c for s, c in zip(shape, coarsen))
+    n_nodes = int(np.prod(coarse_shape))
+    coarse = np.arange(n_nodes, dtype=np.uint64).reshape(coarse_shape)
+    block = np.ones(coarsen, dtype=np.uint64)
+    return np.ascontiguousarray(np.kron(coarse, block))
+
+
+def run(repeats: int = 11) -> dict:
+    labels = _make_labels()
+    rag = bic.graph.region_adjacency_graph(labels)
+    n_nodes = int(rag.number_of_nodes)
+    rng = np.random.default_rng(0)
+
+    results: dict[str, dict] = {}
+    for n_other, tag in ((8, "dense8"), (2000, "sparse2000")):
+        other = rng.integers(0, n_other, size=labels.shape).astype(np.uint64)
+        other = np.ascontiguousarray(other)
+        for threads in (1, 4, 8):
+            def fn(t=threads, o=other):
+                bic.graph.features.accumulate_labels(rag, labels, o, number_of_threads=t)
+
+            key = f"accumulate_labels_{tag}_t{threads}"
+            results[key] = _timeit(fn, repeats)
+            results[key]["meta"] = {
+                "n_nodes": n_nodes,
+                "n_pixels": int(labels.size),
+                "n_other": n_other,
+                "threads": threads,
+            }
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=11)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats)
+    for name, r in results.items():
+        print(f"{name:<32} median={r['median'] * 1e3:8.3f} ms  min={r['min'] * 1e3:8.3f} ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/development/graph/benchmark_bfs.py b/development/graph/benchmark_bfs.py
@@ -0,0 +1,70 @@
+"""Benchmark the workspace-reused BFS path (``lifted_edges_from_node_labels``).
+
+The code review changed ``BfsWorkspace::reset`` from an O(N) clear of the
+visited/distance buffers to an O(1) generation-stamp bump. That only pays off
+when one workspace is reset across many sources, which is exactly what
+``lifted_edges_from_node_labels`` does (one workspace per chunk, reset per
+source). The single-call ``breadth_first_search`` builds a fresh workspace each
+call and would NOT show the change, so we benchmark the lifted-edge path.
+
+Single-threaded so the per-source reset cost is not hidden by parallelism. The
+node count is swept to expose the previous O(N^2)-of-memset behavior.
+
+Not part of the test suite. Run::
+
+    python development/graph/benchmark_bfs.py --repeats 5
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def run(repeats: int = 5, depth: int = 2) -> dict:
+    shapes = [(100, 100), (160, 160), (220, 220)]
+    results: dict[str, dict] = {}
+    for shape in shapes:
+        n_nodes = int(np.prod(shape))
+        graph = bic.graph.grid_graph(shape)
+        rng = np.random.default_rng(0)
+        node_labels = rng.integers(0, 50, size=(n_nodes,), dtype=np.uint64)
+
+        def fn(g=graph, nl=node_labels, d=depth):
+            bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+                g, nl, graph_depth=d, number_of_threads=1
+            )
+
+        key = f"bfs_lifted_{n_nodes}nodes_d{depth}"
+        results[key] = _timeit(fn, repeats)
+        results[key]["meta"] = {"n_nodes": n_nodes, "shape": shape, "depth": depth}
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=5)
+    parser.add_argument("--depth", type=int, default=2)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats, depth=args.depth)
+    for name, r in results.items():
+        print(f"{name:<28} median={r['median'] * 1e3:9.3f} ms  min={r['min'] * 1e3:9.3f} ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/development/segmentation/benchmark_mutex_watershed.py b/development/segmentation/benchmark_mutex_watershed.py
@@ -0,0 +1,89 @@
+"""Benchmark the mutex-watershed grid kernel (bic only).
+
+The code review routed the neighbor computation in ``mutex_watershed_grid``
+through ``detail::valid_offset_target`` (per-axis bounds check) instead of a
+single precomputed flat-offset add. This script times ``mutex_watershed`` on the
+ISBI affinities for 2D and 3D so we can A/B that inner-loop change against a
+pre-review build. No external (affogato) dependency — bic only.
+
+Not part of the test suite. Run::
+
+    python development/segmentation/benchmark_mutex_watershed.py --repeats 5
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+from bioimage_cpp._data import load_isbi_affinities
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def _attractive_flip(affs: np.ndarray, n_attractive: int) -> np.ndarray:
+    # Match the convention in the equivalence checker: attractive channels are
+    # turned into merge affinities (1 - aff).
+    out = affs.copy()
+    out[:n_attractive] *= -1
+    out[:n_attractive] += 1
+    return out
+
+
+def run(repeats: int = 5) -> dict:
+    affinities, offsets = load_isbi_affinities()
+    affinities = np.ascontiguousarray(affinities)
+    offsets = [tuple(o) for o in offsets]
+    results: dict[str, dict] = {}
+
+    # --- 2D: in-plane channels of a single z slice ---
+    channels_2d = [i for i, o in enumerate(offsets) if o[0] == 0]
+    aff2d = np.ascontiguousarray(affinities[channels_2d, 0, :256, :256])
+    offsets_2d = [offsets[i][1:] for i in channels_2d]
+    aff2d_flipped = _attractive_flip(aff2d, 2)
+
+    def mws_2d():
+        bic.segmentation.mutex_watershed(
+            aff2d_flipped, offsets_2d, number_of_attractive_channels=2
+        )
+
+    results["mws_2d_256x256"] = _timeit(mws_2d, repeats)
+    results["mws_2d_256x256"]["meta"] = {"shape": list(aff2d.shape)}
+
+    # --- 3D: small crop, all offsets ---
+    aff3d = np.ascontiguousarray(affinities[:, :6, :256, :256])
+    aff3d_flipped = _attractive_flip(aff3d, 3)
+
+    def mws_3d():
+        bic.segmentation.mutex_watershed(
+            aff3d_flipped, offsets, number_of_attractive_channels=3
+        )
+
+    results["mws_3d_6x256x256"] = _timeit(mws_3d, repeats)
+    results["mws_3d_6x256x256"]["meta"] = {"shape": list(aff3d.shape)}
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=5)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats)
+    for name, r in results.items():
+        print(f"{name:<20} median={r['median'] * 1e3:9.3f} ms  min={r['min'] * 1e3:9.3f} ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/development/utils/benchmark_union_find.py b/development/utils/benchmark_union_find.py
@@ -0,0 +1,70 @@
+"""Micro-benchmark for the UnionFind Python bindings.
+
+Times the bulk ``merge((N, 2) edges)`` and ``find(node_array)`` entry points,
+which the code review changed to validate every node id against ``uf.size``
+before touching the C++ structure (an extra O(N) pre-pass). This script lets us
+A/B that pre-pass against a pre-review build.
+
+Not part of the test suite. Run::
+
+    python development/utils/benchmark_union_find.py --repeats 11
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def run(repeats: int = 11, n_nodes: int = 1_000_000, n_edges: int = 2_000_000) -> dict:
+    rng = np.random.default_rng(0)
+    edges = rng.integers(0, n_nodes, size=(n_edges, 2), dtype=np.uint64)
+    query = rng.integers(0, n_nodes, size=(n_edges,), dtype=np.uint64)
+
+    # Pre-merged structure for the find benchmark (find does not mutate the
+    # partition, so it can be reused across repeats).
+    merged = bic.utils.UnionFind(n_nodes)
+    merged.merge(edges)
+
+    def bulk_merge():
+        uf = bic.utils.UnionFind(n_nodes)
+        uf.merge(edges)
+
+    def bulk_find():
+        merged.find(query)
+
+    results = {
+        "uf_bulk_merge": _timeit(bulk_merge, repeats),
+        "uf_bulk_find": _timeit(bulk_find, repeats),
+    }
+    for r in results.values():
+        r["meta"] = {"n_nodes": n_nodes, "n_edges": n_edges}
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=11)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats)
+    for name, r in results.items():
+        print(f"{name:<16} median={r['median'] * 1e3:8.3f} ms  min={r['min'] * 1e3:8.3f} ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/include/bioimage_cpp/detail/grid.hxx b/include/bioimage_cpp/detail/grid.hxx
@@ -6,6 +6,16 @@
 
 namespace bioimage_cpp::detail {
 
+// Total number of elements in a row-major array of the given shape (the product
+// of the per-axis extents). Shape entries are assumed non-negative.
+inline std::size_t number_of_elements(const std::vector<std::ptrdiff_t> &shape) {
+    std::size_t total = 1;
+    for (const auto extent : shape) {
+        total *= static_cast<std::size_t>(extent);
+    }
+    return total;
+}
+
 // C-order strides for a row-major array of the given shape, in units of array
 // elements (not bytes). The innermost (last) axis has stride 1.
 inline std::vector<std::ptrdiff_t> c_order_strides(const std::vector<std::ptrdiff_t> &shape) {

diff --git a/include/bioimage_cpp/detail/profile.hxx b/include/bioimage_cpp/detail/profile.hxx
@@ -89,7 +89,12 @@ inline ProfileTimerNull make_profile_timer(NullProfiler &profiler, const char *n
 } // namespace bioimage_cpp::detail
 
 #define BIOIMAGE_PROFILE_INIT(var) ::bioimage_cpp::detail::Profiler var;
-#define BIOIMAGE_PROFILE_SCOPE(var, name) auto _bp_##__LINE__ = ::bioimage_cpp::detail::make_profile_timer(var, name);
+// Two-level indirection so __LINE__ is expanded before the token paste; without
+// it every BIOIMAGE_PROFILE_SCOPE in a translation unit would declare the same
+// identifier `_bp___LINE__`, breaking two scopes in one block.
+#define BIOIMAGE_PROFILE_CONCAT_(a, b) a##b
+#define BIOIMAGE_PROFILE_CONCAT(a, b) BIOIMAGE_PROFILE_CONCAT_(a, b)
+#define BIOIMAGE_PROFILE_SCOPE(var, name) auto BIOIMAGE_PROFILE_CONCAT(_bp_, __LINE__) = ::bioimage_cpp::detail::make_profile_timer(var, name);
 #define BIOIMAGE_PROFILE_REPORT(var) (var).report();
 
 #else

diff --git a/include/bioimage_cpp/flow/flow_density.hxx b/include/bioimage_cpp/flow/flow_density.hxx
@@ -109,7 +109,10 @@ inline std::ptrdiff_t round_to_flat_index(
         } else if (clipped > grid.upper[axis]) {
             clipped = grid.upper[axis];
         }
-        const auto coord = static_cast<std::ptrdiff_t>(std::nearbyint(clipped));
+        // Round half up, matching the nearest-neighbor convention in
+        // transformation/affine.hxx and segmentation/watershed.hxx.
+        // std::nearbyint would honor the FP rounding mode (round-half-to-even).
+        const auto coord = static_cast<std::ptrdiff_t>(std::floor(clipped + 0.5f));
         flat += coord * grid.strides[axis];
     }
     return flat;

diff --git a/include/bioimage_cpp/graph/agglomeration/detail.hxx b/include/bioimage_cpp/graph/agglomeration/detail.hxx
@@ -93,8 +93,9 @@ inline std::size_t agglo_merge_dynamic_nodes(
             if (new_priority != current_priority) {
                 edge.weight = new_priority;
                 // The edge may have been previously popped via kRejectEdge
-                // (GASP cannot-link), in which case it is no longer in the
-                // heap. Use push_or_change to handle both cases.
+                // (GASP cannot-link). In that case it must stay out of the heap
+                // (re-adding it would let it be re-popped and merged), so only
+                // update the priority when the edge is still present.
                 if (heap.contains(removed_edge_id)) {
                     heap.change(removed_edge_id, new_priority);
                 }