From 552249cab15f1ed77bfecd94c31c03260f2fc3c9 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Thu, 11 Jun 2026 23:55:00 +0200
Subject: [PATCH 1/7] Harden UnionFind bindings and mutex-watershed neighbor
 bounds

B2: validate node ids at the UnionFind binding boundary. UnionFind::find/
merge/merge_to index their parent vectors without a bounds check (intentional
for the hot path), but the scalar and array bindings passed user-supplied ids
straight through, so an out-of-range id was undefined behavior / a segfault
from Python. Add check_node() and route the scalar find/merge/merge_to binds
and the merge_edges/find_nodes helpers through it, throwing std::invalid_argument
before releasing the GIL.

B1: route the mutex-watershed and semantic-mutex-watershed neighbor computation
through detail::valid_offset_target instead of an unchecked flat-index shift.
This bounds-checks the neighbor per axis, so the kernel is memory-safe and
rejects row/plane wrap-around independently of the caller's valid_edges mask.
Drops the now-unused offset_strides precompute.

Add regression tests for the new UnionFind validation and for the existing
binding-layer point-coordinate validation in non_maximum_distance_suppression.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../non_maximum_distance_suppression.hxx      |  4 +-
 .../segmentation/mutex_watershed.hxx          | 16 +++----
 .../segmentation/semantic_mutex_watershed.hxx | 16 +++----
 src/bindings/util.cxx                         | 46 +++++++++++++++++--
 .../test_non_maximum_distance_suppression.py  | 14 ++++++
 tests/test_util_union_find.py                 | 30 ++++++++++++
 6 files changed, 102 insertions(+), 24 deletions(-)
diff --git a/include/bioimage_cpp/non_maximum_distance_suppression.hxx b/include/bioimage_cpp/non_maximum_distance_suppression.hxx
index 1239e73..3a0806c 100644
--- a/include/bioimage_cpp/non_maximum_distance_suppression.hxx
+++ b/include/bioimage_cpp/non_maximum_distance_suppression.hxx
@@ -81,7 +81,9 @@ inline void non_maximum_distance_suppression(
     const auto strides = bioimage_cpp::detail::c_order_strides(distance_map.shape);
     const auto n = static_cast<std::size_t>(n_points);
 
-    // Precompute flat index and distance value at each point.
+    // Precompute flat index and distance value at each point. Point coordinates
+    // are validated against distance_map bounds in the binding layer before this
+    // is called (see bindings/distance.cxx).
     std::vector<float> point_dist(n);
     for (std::size_t i = 0; i < n; ++i) {
         const auto *row =
diff --git a/include/bioimage_cpp/segmentation/mutex_watershed.hxx b/include/bioimage_cpp/segmentation/mutex_watershed.hxx
index 743cf1a..e75c645 100644
--- a/include/bioimage_cpp/segmentation/mutex_watershed.hxx
+++ b/include/bioimage_cpp/segmentation/mutex_watershed.hxx
@@ -78,13 +78,6 @@ void mutex_watershed_grid(
     ));
     const auto spatial_strides = detail::c_order_strides(spatial_shape);
 
-    std::vector<std::ptrdiff_t> offset_strides(number_of_channels, 0);
-    for (std::size_t channel = 0; channel < number_of_channels; ++channel) {
-        for (std::size_t axis = 0; axis < spatial_ndim; ++axis) {
-            offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis];
-        }
-    }
-
     const auto number_of_edges = number_of_nodes * number_of_channels;
     std::vector<WeightedGridEdge<T>> edge_order;
     edge_order.reserve(static_cast<std::size_t>(number_of_edges));
@@ -108,8 +101,13 @@ void mutex_watershed_grid(
         const auto channel = static_cast<std::size_t>(edge_id / number_of_nodes);
         const auto u = edge_id % number_of_nodes;
 
-        const auto v_signed = static_cast<std::int64_t>(u) + static_cast<std::int64_t>(offset_strides[channel]);
-        const auto v = static_cast<std::uint64_t>(v_signed);
+        // Bounds-check the neighbor per axis rather than relying solely on the
+        // valid_edges mask: this keeps the kernel memory-safe and rejects edges
+        // that would wrap across a row/plane boundary.
+        std::uint64_t v = 0;
+        if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) {
+            continue;
+        }
         std::uint64_t root_u = sets.find(u);
         std::uint64_t root_v = sets.find(v);
         if (root_u == root_v) {
diff --git a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx
index 0db53d2..5c10f41 100644
--- a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx
+++ b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx
@@ -105,13 +105,6 @@ void semantic_mutex_watershed_grid(
     ));
     const auto spatial_strides = detail::c_order_strides(spatial_shape);
 
-    std::vector<std::ptrdiff_t> offset_strides(number_of_offsets, 0);
-    for (std::size_t channel = 0; channel < number_of_offsets; ++channel) {
-        for (std::size_t axis = 0; axis < spatial_ndim; ++axis) {
-            offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis];
-        }
-    }
-
     struct WeightedGridEdge {
         T weight;
         std::uint64_t id;
@@ -155,9 +148,12 @@ void semantic_mutex_watershed_grid(
             continue;
         }
 
-        const auto v_signed = static_cast<std::int64_t>(u) +
-            static_cast<std::int64_t>(offset_strides[channel]);
-        const auto v = static_cast<std::uint64_t>(v_signed);
+        // Bounds-check the neighbor per axis (see mutex_watershed_grid): keeps
+        // the kernel memory-safe and rejects edges that wrap across a boundary.
+        std::uint64_t v = 0;
+        if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) {
+            continue;
+        }
         const auto root_u = sets.find(u);
         const auto root_v = sets.find(v);
         if (root_u == root_v) {
diff --git a/src/bindings/util.cxx b/src/bindings/util.cxx
index 90a972e..519c87c 100644
--- a/src/bindings/util.cxx
+++ b/src/bindings/util.cxx
@@ -18,6 +18,18 @@ using EdgeArray = nb::ndarray<nb::numpy, const std::uint64_t, nb::c_contig>;
 using NodeArray = nb::ndarray<nb::numpy, const std::uint64_t, nb::c_contig, nb::ndim<1>>;
 using OutputArray = nb::ndarray<nb::numpy, std::uint64_t, nb::c_contig>;
 
+// UnionFind::find/merge/merge_to index their parent/rank vectors without a
+// bounds check (intentionally, for the hot path). Validate node ids at the
+// binding boundary so out-of-range ids raise a clear error instead of UB.
+void check_node(const util::UnionFind &uf, const std::uint64_t node, const char *name) {
+    if (node >= uf.size()) {
+        throw std::invalid_argument(
+            std::string(name) + " out of range: got " + name + "="
+            + std::to_string(node) + ", size=" + std::to_string(uf.size())
+        );
+    }
+}
+
 void merge_edges(
     util::UnionFind &uf,
     EdgeArray edges
@@ -39,6 +51,11 @@ void merge_edges(
     const auto n_edges = edges.shape(0);
     const auto *data = edges.data();
 
+    for (std::size_t i = 0; i < n_edges; ++i) {
+        check_node(uf, data[2 * i], "edge endpoint");
+        check_node(uf, data[2 * i + 1], "edge endpoint");
+    }
+
     {
         nb::gil_scoped_release release;
         for (std::size_t i = 0; i < n_edges; ++i) {
@@ -49,10 +66,14 @@ void merge_edges(
 
 OutputArray find_nodes(util::UnionFind &uf, NodeArray nodes) {
     const auto n = nodes.shape(0);
+    const auto *input = nodes.data();
+    for (std::size_t i = 0; i < n; ++i) {
+        check_node(uf, input[i], "node");
+    }
+
     auto *out = new std::uint64_t[n]();
     nb::capsule owner(out, [](void *p) noexcept { delete[] static_cast<std::uint64_t *>(p); });
 
-    const auto *input = nodes.data();
     {
         nb::gil_scoped_release release;
         for (std::size_t i = 0; i < n; ++i) {
@@ -64,6 +85,23 @@ OutputArray find_nodes(util::UnionFind &uf, NodeArray nodes) {
     return OutputArray(out, 1, shape, owner);
 }
 
+std::uint64_t find_node(util::UnionFind &uf, const std::uint64_t node) {
+    check_node(uf, node, "node");
+    return uf.find(node);
+}
+
+std::uint64_t merge_pair(util::UnionFind &uf, const std::uint64_t first, const std::uint64_t second) {
+    check_node(uf, first, "first");
+    check_node(uf, second, "second");
+    return uf.merge(first, second);
+}
+
+std::uint64_t merge_to_node(util::UnionFind &uf, const std::uint64_t stable, const std::uint64_t removed) {
+    check_node(uf, stable, "stable");
+    check_node(uf, removed, "removed");
+    return uf.merge_to(stable, removed);
+}
+
 OutputArray element_labeling(util::UnionFind &uf) {
     const auto n = uf.size();
     auto *out = new std::uint64_t[n]();
@@ -98,7 +136,7 @@ void bind_util(nb::module_ &m) {
         )
         .def(
             "find",
-            &util::UnionFind::find,
+            &find_node,
             nb::arg("node"),
             "Return the (path-compressed) root of `node`."
         )
@@ -110,7 +148,7 @@ void bind_util(nb::module_ &m) {
         )
         .def(
             "merge",
-            &util::UnionFind::merge,
+            &merge_pair,
             nb::arg("first"),
             nb::arg("second"),
             "Union the sets containing `first` and `second`. Returns the new root."
@@ -123,7 +161,7 @@ void bind_util(nb::module_ &m) {
         )
         .def(
             "merge_to",
-            &util::UnionFind::merge_to,
+            &merge_to_node,
             nb::arg("stable"),
             nb::arg("removed"),
             "Union the sets containing `stable` and `removed`, forcing "
diff --git a/tests/distance/test_non_maximum_distance_suppression.py b/tests/distance/test_non_maximum_distance_suppression.py
index a152064..194e6d3 100644
--- a/tests/distance/test_non_maximum_distance_suppression.py
+++ b/tests/distance/test_non_maximum_distance_suppression.py
@@ -33,6 +33,20 @@ def test_two_close_points_keeps_higher_value():
     assert out.tolist() == [[5, 5]]
 
 
+def test_out_of_range_point_coordinate_raises():
+    dm = np.zeros((10, 10), dtype=np.float32)
+    pts = np.array([[5, 5], [5, 12]], dtype=np.int64)
+    with pytest.raises(ValueError, match="out of bounds"):
+        nms(dm, pts)
+
+
+def test_negative_point_coordinate_raises():
+    dm = np.zeros((10, 10), dtype=np.float32)
+    pts = np.array([[-1, 5]], dtype=np.int64)
+    with pytest.raises(ValueError, match="out of bounds"):
+        nms(dm, pts)
+
+
 def test_two_far_points_both_survive():
     dm = np.zeros((20, 20), dtype=np.float32)
     dm[2, 2] = 1.0
diff --git a/tests/test_util_union_find.py b/tests/test_util_union_find.py
index 97123bd..9e792b2 100644
--- a/tests/test_util_union_find.py
+++ b/tests/test_util_union_find.py
@@ -141,3 +141,33 @@ def test_bulk_merge_rejects_three_columns():
     uf = bic.utils.UnionFind(3)
     with pytest.raises(Exception, match=r"\(N, 2\)"):
         uf.merge(np.zeros((2, 3), dtype=np.uint64))
+
+
+def test_scalar_find_rejects_out_of_range_node():
+    uf = bic.utils.UnionFind(3)
+    with pytest.raises(ValueError, match="out of range"):
+        uf.find(3)
+
+
+def test_scalar_merge_rejects_out_of_range_node():
+    uf = bic.utils.UnionFind(3)
+    with pytest.raises(ValueError, match="out of range"):
+        uf.merge(0, 5)
+
+
+def test_merge_to_rejects_out_of_range_node():
+    uf = bic.utils.UnionFind(3)
+    with pytest.raises(ValueError, match="out of range"):
+        uf.merge_to(7, 0)
+
+
+def test_bulk_find_rejects_out_of_range_node():
+    uf = bic.utils.UnionFind(3)
+    with pytest.raises(ValueError, match="out of range"):
+        uf.find(np.array([0, 1, 9], dtype=np.uint64))
+
+
+def test_bulk_merge_rejects_out_of_range_node():
+    uf = bic.utils.UnionFind(3)
+    with pytest.raises(ValueError, match="out of range"):
+        uf.merge(np.array([[0, 1], [2, 8]], dtype=np.uint64))

From 38d395ed17470163d6baa7d9581edaa88a7613ea Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Thu, 11 Jun 2026 23:58:31 +0200
Subject: [PATCH 2/7] Fix determinism issues in sorting, proposal seeding, and
 rounding

B3: argsort_by_first broke ties on the primary key nondeterministically
(std::sort is unstable). Add a secondary tie-break by original index so the
ordering is a total order. This makes the label-multiset downsample argmax and
restrict_set truncation deterministic on count ties (smaller id wins).

B4: GreedyAdditiveMulticutProposalGenerator derived its per-call seed as
seed_ + call_count_, which collided with the fusion-move driver's per-slot
seed + slot offset (different (slot, iteration) cells computed identical
proposals). Mix the packed (seed, counter) pair through a SplitMix64 finalizer
to remove the linearity.

B7: WatershedProposalGenerator::reset re-seeded the RNG but left the cached
normal-distribution deviate, so a reset generator was not identical to a fresh
one. Add noise_.reset().

flow: round_to_flat_index used std::nearbyint (FP-rounding-mode dependent,
half-to-even); switch to floor(x + 0.5) to match the half-up convention used
in affine.hxx and watershed.hxx.

Add a determinism regression test for the downsample count-tie case.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 include/bioimage_cpp/flow/flow_density.hxx    |  5 ++++-
 .../greedy_additive_multicut.hxx              | 19 ++++++++++++++++++-
 .../graph/proposal_generators/watershed.hxx   |  3 +++
 .../bioimage_cpp/label_multiset/multiset.hxx  | 14 ++++++++++++--
 tests/label_multiset/test_downsample.py       | 18 ++++++++++++++++++
 5 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/include/bioimage_cpp/flow/flow_density.hxx b/include/bioimage_cpp/flow/flow_density.hxx
index 862cfda..60cbf2d 100644
--- a/include/bioimage_cpp/flow/flow_density.hxx
+++ b/include/bioimage_cpp/flow/flow_density.hxx
@@ -109,7 +109,10 @@ inline std::ptrdiff_t round_to_flat_index(
         } else if (clipped > grid.upper[axis]) {
             clipped = grid.upper[axis];
         }
-        const auto coord = static_cast<std::ptrdiff_t>(std::nearbyint(clipped));
+        // Round half up, matching the nearest-neighbor convention in
+        // transformation/affine.hxx and segmentation/watershed.hxx.
+        // std::nearbyint would honor the FP rounding mode (round-half-to-even).
+        const auto coord = static_cast<std::ptrdiff_t>(std::floor(clipped + 0.5f));
         flat += coord * grid.strides[axis];
     }
     return flat;
diff --git a/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx b/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx
index 4408f7f..d0dc15b 100644
--- a/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx
+++ b/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx
@@ -54,7 +54,7 @@ public:
             weight_stop_,
             node_num_stop_,
             true,
-            seed_ + static_cast<int>(call_count_),
+            mixed_seed(seed_, call_count_),
             sigma_
         );
         ++call_count_;
@@ -65,6 +65,23 @@ public:
     }
 
 private:
+    // Mix the base seed and the call counter so proposals from different
+    // (slot, iteration) cells never realign. A plain `seed_ + call_count_`
+    // collided with the per-slot `seed + slot` offset applied by the fusion-move
+    // driver (slot 0 at iteration i+1 reused the seed of slot 1 at iteration i),
+    // recomputing identical proposals and shrinking effective diversity. A
+    // SplitMix64 finalizer over the packed (seed, counter) pair removes the
+    // linearity.
+    static int mixed_seed(const int base, const std::size_t counter) {
+        std::uint64_t z =
+            (static_cast<std::uint64_t>(static_cast<std::uint32_t>(base)) << 32) |
+            static_cast<std::uint64_t>(static_cast<std::uint32_t>(counter));
+        z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
+        z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
+        z = z ^ (z >> 31);
+        return static_cast<int>(static_cast<std::uint32_t>(z));
+    }
+
     const UndirectedGraph &graph_;
     std::vector<double> edge_costs_;
     double sigma_;
diff --git a/include/bioimage_cpp/graph/proposal_generators/watershed.hxx b/include/bioimage_cpp/graph/proposal_generators/watershed.hxx
index 541c6f1..ed31c8c 100644
--- a/include/bioimage_cpp/graph/proposal_generators/watershed.hxx
+++ b/include/bioimage_cpp/graph/proposal_generators/watershed.hxx
@@ -109,6 +109,9 @@ public:
 
     void reset() override {
         generator_.seed(static_cast<std::mt19937::result_type>(seed_));
+        // std::normal_distribution caches a second Box-Muller deviate; clear it
+        // so a reset generator is bytewise identical to a freshly constructed one.
+        noise_.reset();
     }
 
 private:
diff --git a/include/bioimage_cpp/label_multiset/multiset.hxx b/include/bioimage_cpp/label_multiset/multiset.hxx
index 53e97a1..bc9cda2 100644
--- a/include/bioimage_cpp/label_multiset/multiset.hxx
+++ b/include/bioimage_cpp/label_multiset/multiset.hxx
@@ -24,12 +24,22 @@ template <class V1, class V2>
 inline void argsort_by_first(V1 &v1, V2 &v2, const bool ascending = true) {
     std::vector<std::size_t> idx(v1.size());
     std::iota(idx.begin(), idx.end(), 0);
+    // Break ties on the primary key by original index so the ordering is a
+    // total order. std::sort is unstable, so without this tie-break equal keys
+    // (e.g. equal counts when sorting label entries) would be reordered
+    // nondeterministically across runs / STL implementations.
     if (ascending) {
         std::sort(idx.begin(), idx.end(),
-                  [&v1](std::size_t a, std::size_t b) { return v1[a] < v1[b]; });
+                  [&v1](std::size_t a, std::size_t b) {
+                      if (v1[a] != v1[b]) return v1[a] < v1[b];
+                      return a < b;
+                  });
     } else {
         std::sort(idx.begin(), idx.end(),
-                  [&v1](std::size_t a, std::size_t b) { return v1[a] > v1[b]; });
+                  [&v1](std::size_t a, std::size_t b) {
+                      if (v1[a] != v1[b]) return v1[a] > v1[b];
+                      return a < b;
+                  });
     }
     reorder_inplace(v1, idx);
     reorder_inplace(v2, idx);
diff --git a/tests/label_multiset/test_downsample.py b/tests/label_multiset/test_downsample.py
index a0242b1..61ba50b 100644
--- a/tests/label_multiset/test_downsample.py
+++ b/tests/label_multiset/test_downsample.py
@@ -123,3 +123,21 @@ def test_downsample_restrict_set_keeps_top_k():
     assert ids.tolist() == [0]
     assert counts.tolist() == [15]
     assert ms_top1.argmax.tolist() == [0]
+
+
+def test_downsample_restrict_set_count_tie_is_deterministic():
+    # Two labels with equal counts (8 each). With restrict_set=1 and a count
+    # tie, the argmax / retained entry must be deterministic: the smaller id
+    # wins (the sort breaks ties by id). Run twice to confirm stability.
+    labels = np.empty((4, 4), dtype=np.uint64)
+    labels[:2, :] = 1
+    labels[2:, :] = 2
+    ms_full = multiset_from_labels(labels, (1, 1))
+    b = Blocking([0, 0], [4, 4], [4, 4])
+    results = []
+    for _ in range(2):
+        ms_top1 = downsample_multiset(ms_full, b, restrict_set=1)
+        ids, counts = ms_top1.entry(0)
+        results.append((ms_top1.argmax.tolist(), ids.tolist(), counts.tolist()))
+    assert results[0] == results[1]
+    assert results[0] == ([1], [1], [8])

From 19a19423269ec1f446f0ed4e9501800a1adf8bc7 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Fri, 12 Jun 2026 00:00:38 +0200
Subject: [PATCH 3/7] Fix BIOIMAGE_PROFILE_SCOPE to generate unique identifiers

`_bp_##__LINE__` does not expand __LINE__ (token-paste suppresses operand
expansion), so every BIOIMAGE_PROFILE_SCOPE declared the same identifier
`_bp___LINE__`. It compiled only because each call site happened to wrap its
scope in its own block; two scopes in one block (the documented per-phase
pattern) would be a redefinition error in BIOIMAGE_PROFILE=ON builds. Add the
standard two-level concat indirection so __LINE__ is expanded first.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 include/bioimage_cpp/detail/profile.hxx | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/bioimage_cpp/detail/profile.hxx b/include/bioimage_cpp/detail/profile.hxx
index 124ab4c..6f1f7ac 100644
--- a/include/bioimage_cpp/detail/profile.hxx
+++ b/include/bioimage_cpp/detail/profile.hxx
@@ -89,7 +89,12 @@ inline ProfileTimerNull make_profile_timer(NullProfiler &profiler, const char *n
 } // namespace bioimage_cpp::detail
 
 #define BIOIMAGE_PROFILE_INIT(var) ::bioimage_cpp::detail::Profiler var;
-#define BIOIMAGE_PROFILE_SCOPE(var, name) auto _bp_##__LINE__ = ::bioimage_cpp::detail::make_profile_timer(var, name);
+// Two-level indirection so __LINE__ is expanded before the token paste; without
+// it every BIOIMAGE_PROFILE_SCOPE in a translation unit would declare the same
+// identifier `_bp___LINE__`, breaking two scopes in one block.
+#define BIOIMAGE_PROFILE_CONCAT_(a, b) a##b
+#define BIOIMAGE_PROFILE_CONCAT(a, b) BIOIMAGE_PROFILE_CONCAT_(a, b)
+#define BIOIMAGE_PROFILE_SCOPE(var, name) auto BIOIMAGE_PROFILE_CONCAT(_bp_, __LINE__) = ::bioimage_cpp::detail::make_profile_timer(var, name);
 #define BIOIMAGE_PROFILE_REPORT(var) (var).report();
 
 #else

From 1946cb4066efe9ecbf939b4f0eca2b1a3f5b8155 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Fri, 12 Jun 2026 00:06:16 +0200
Subject: [PATCH 4/7] Performance: drop redundant work in greedy-additive, BFS,
 label accumulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I1: GreedyAdditiveWorkspace::reset called heap.reset_capacity, and so did
detail::initialize_dynamic_graph which every caller runs immediately after.
The second call re-wiped the heap locator vector (O(E)) on every solve — hot
under fusion-move (per fuse x slot x iteration). Drop it from reset() in both
the multicut and lifted-multicut workspaces; initialize_dynamic_graph owns it.

I3: BfsWorkspace::reset cleared the visited/distance buffers (O(N) over the
whole graph) on every call, making the "k-hop neighborhood from every node"
pattern O(N^2) of memset. Replace the boolean visited buffer with a uint32
generation stamp: reset just increments the generation (O(1)), clearing only on
the rare wraparound. distance_ is sized but written before read.

I4: accumulate_labels used an n_threads x n_nodes map-of-maps plus a per-node
merge map. Switch to one combined-key (node, other) histogram per thread,
merged once, with a single-pass per-node argmax. Far fewer container
allocations; the argmax/tie-break/empty-node semantics are unchanged (covered
by the existing accumulate_labels tests, incl. parallel-vs-single and nifty
cross-checks).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../graph/breadth_first_search.hxx            | 39 ++++++--
 .../bioimage_cpp/graph/label_accumulation.hxx | 94 +++++++++++--------
 .../graph/lifted_multicut/greedy_additive.hxx |  4 +-
 .../graph/multicut/greedy_additive.hxx        |  4 +-
 4 files changed, 93 insertions(+), 48 deletions(-)

diff --git a/include/bioimage_cpp/graph/breadth_first_search.hxx b/include/bioimage_cpp/graph/breadth_first_search.hxx
index 0f140d3..8255d25 100644
--- a/include/bioimage_cpp/graph/breadth_first_search.hxx
+++ b/include/bioimage_cpp/graph/breadth_first_search.hxx
@@ -2,6 +2,7 @@
 
 #include "bioimage_cpp/graph/undirected_graph.hxx"
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -19,23 +20,44 @@ struct BfsEntry {
 };
 
 // Reusable scratch state for `breadth_first_search`. Reset via `reset(graph)`
-// before each call so the visited-flag and distance buffers grow once and stay
+// before each call so the visited and distance buffers grow once and stay
 // allocated across many BFS invocations on the same graph.
+//
+// Visited tracking uses a per-call generation stamp rather than a boolean
+// buffer cleared each call: `reset` just increments the generation (O(1)),
+// doing a full clear only on the rare 32-bit wraparound. This matters for the
+// "k-hop neighborhood from every node" pattern, where an O(N)-per-call clear
+// would otherwise make the whole sweep O(N^2). `distance_` is sized but not
+// cleared; it is written before being read for every visited node.
 class BfsWorkspace {
 public:
     BfsWorkspace() = default;
 
     void reset(const UndirectedGraph &graph) {
         const auto n_nodes = static_cast<std::size_t>(graph.number_of_nodes());
-        visited_.assign(n_nodes, 0);
-        distance_.assign(n_nodes, 0);
+        if (visited_.size() != n_nodes) {
+            visited_.assign(n_nodes, 0);
+            generation_ = 0;
+        }
+        distance_.resize(n_nodes);
+        if (generation_ == std::numeric_limits<std::uint32_t>::max()) {
+            std::fill(visited_.begin(), visited_.end(), std::uint32_t{0});
+            generation_ = 0;
+        }
+        ++generation_;
     }
 
-    [[nodiscard]] std::vector<unsigned char> &visited() { return visited_; }
+    [[nodiscard]] bool is_visited(const std::uint64_t node) const {
+        return visited_[static_cast<std::size_t>(node)] == generation_;
+    }
+    void mark_visited(const std::uint64_t node) {
+        visited_[static_cast<std::size_t>(node)] = generation_;
+    }
     [[nodiscard]] std::vector<std::uint64_t> &distance() { return distance_; }
 
 private:
-    std::vector<unsigned char> visited_;
+    std::vector<std::uint32_t> visited_;
+    std::uint32_t generation_ = 0;
     std::vector<std::uint64_t> distance_;
 };
 
@@ -71,13 +93,12 @@ inline std::vector<BfsEntry> breadth_first_search(
         );
     }
     workspace.reset(graph);
-    auto &visited = workspace.visited();
     auto &distance = workspace.distance();
 
     std::vector<BfsEntry> result;
     std::queue<std::uint64_t> queue;
     queue.push(source);
-    visited[static_cast<std::size_t>(source)] = 1;
+    workspace.mark_visited(source);
     distance[static_cast<std::size_t>(source)] = 0;
     if (include_source) {
         result.push_back({source, 0});
@@ -93,10 +114,10 @@ inline std::vector<BfsEntry> breadth_first_search(
         const auto next_distance = node_distance + 1;
         for (const auto adjacency : graph.node_adjacency(node)) {
             const auto neighbor = adjacency.node;
-            if (visited[static_cast<std::size_t>(neighbor)] != 0) {
+            if (workspace.is_visited(neighbor)) {
                 continue;
             }
-            visited[static_cast<std::size_t>(neighbor)] = 1;
+            workspace.mark_visited(neighbor);
             distance[static_cast<std::size_t>(neighbor)] = next_distance;
             result.push_back({neighbor, next_distance});
             queue.push(neighbor);
diff --git a/include/bioimage_cpp/graph/label_accumulation.hxx b/include/bioimage_cpp/graph/label_accumulation.hxx
index cb19b5e..5df6a85 100644
--- a/include/bioimage_cpp/graph/label_accumulation.hxx
+++ b/include/bioimage_cpp/graph/label_accumulation.hxx
@@ -26,6 +26,32 @@ inline std::size_t number_of_pixels(const std::vector<std::ptrdiff_t> &shape) {
     ));
 }
 
+// Combined (node, other) histogram key. A single map per thread keyed by this
+// pair avoids the n_threads * n_nodes map-of-maps (one std::unordered_map per
+// node per thread), which dominates allocation for RAGs with many nodes.
+template <class OtherT>
+struct NodeOther {
+    std::uint64_t node;
+    OtherT other;
+    bool operator==(const NodeOther &rhs) const noexcept {
+        return node == rhs.node && other == rhs.other;
+    }
+};
+
+template <class OtherT>
+struct NodeOtherHash {
+    std::size_t operator()(const NodeOther<OtherT> &key) const noexcept {
+        std::uint64_t h = key.node * 0x9E3779B97F4A7C15ULL;
+        h ^= static_cast<std::uint64_t>(key.other) + 0x9E3779B97F4A7C15ULL +
+             (h << 6) + (h >> 2);
+        return static_cast<std::size_t>(h);
+    }
+};
+
+template <class OtherT>
+using NodeOtherHistogram =
+    std::unordered_map<NodeOther<OtherT>, std::uint64_t, NodeOtherHash<OtherT>>;
+
 template <class LabelT, class OtherT>
 void scan_chunk(
     const LabelT *labels,
@@ -34,7 +60,7 @@ void scan_chunk(
     const std::size_t pixel_end,
     const bool has_ignore_value,
     const OtherT ignore_value,
-    std::vector<std::unordered_map<OtherT, std::uint64_t>> &histograms
+    NodeOtherHistogram<OtherT> &histogram
 ) {
     for (std::size_t index = pixel_begin; index < pixel_end; ++index) {
         const auto other = other_labels[index];
@@ -42,7 +68,7 @@ void scan_chunk(
             continue;
         }
         const auto node = detail::checked_label_to_node(labels[index]);
-        ++histograms[static_cast<std::size_t>(node)][other];
+        ++histogram[NodeOther<OtherT>{static_cast<std::uint64_t>(node), other}];
     }
 }
 
@@ -81,10 +107,7 @@ void accumulate_labels(
     const auto n_pixels = detail_label_accumulation::number_of_pixels(labels.shape);
     const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels);
 
-    std::vector<std::vector<std::unordered_map<OtherT, std::uint64_t>>> per_thread(
-        n_threads,
-        std::vector<std::unordered_map<OtherT, std::uint64_t>>(number_of_nodes)
-    );
+    std::vector<detail_label_accumulation::NodeOtherHistogram<OtherT>> per_thread(n_threads);
 
     bioimage_cpp::detail::parallel_for_chunks(
         n_threads,
@@ -102,38 +125,35 @@ void accumulate_labels(
         }
     );
 
-    // Merge per-thread histograms and pick majority per node in one pass over
-    // nodes. This is embarrassingly parallel across nodes.
-    const auto node_threads = detail::normalize_thread_count(number_of_threads, number_of_nodes);
-    bioimage_cpp::detail::parallel_for_chunks(
-        node_threads,
-        number_of_nodes,
-        [&](const std::size_t, const std::size_t node_begin, const std::size_t node_end) {
-            for (std::size_t node = node_begin; node < node_end; ++node) {
-                std::unordered_map<OtherT, std::uint64_t> merged;
-                for (auto &thread_histograms : per_thread) {
-                    auto &node_hist = thread_histograms[node];
-                    for (const auto &entry : node_hist) {
-                        merged[entry.first] += entry.second;
-                    }
-                    node_hist.clear();
-                }
-                OtherT best_label = OtherT{0};
-                std::uint64_t best_count = 0;
-                bool has_best = false;
-                for (const auto &entry : merged) {
-                    if (!has_best ||
-                        entry.second > best_count ||
-                        (entry.second == best_count && entry.first < best_label)) {
-                        best_label = entry.first;
-                        best_count = entry.second;
-                        has_best = true;
-                    }
-                }
-                out.data[node] = has_best ? best_label : OtherT{0};
-            }
+    // Merge the per-thread histograms into one, then pick the majority `other`
+    // label per node (smaller label wins ties; nodes with no pixels stay 0).
+    // The argmax pass is single-threaded over distinct (node, other) pairs,
+    // which is cheap relative to the parallel pixel scan above.
+    auto &merged = per_thread[0];
+    for (std::size_t thread_id = 1; thread_id < per_thread.size(); ++thread_id) {
+        for (const auto &entry : per_thread[thread_id]) {
+            merged[entry.first] += entry.second;
         }
-    );
+        detail_label_accumulation::NodeOtherHistogram<OtherT>().swap(per_thread[thread_id]);
+    }
+
+    for (std::size_t node = 0; node < number_of_nodes; ++node) {
+        out.data[node] = OtherT{0};
+    }
+    std::vector<std::uint64_t> best_count(number_of_nodes, 0);
+    std::vector<unsigned char> has_best(number_of_nodes, 0);
+    for (const auto &entry : merged) {
+        const auto node = static_cast<std::size_t>(entry.first.node);
+        const auto other = entry.first.other;
+        const auto count = entry.second;
+        if (has_best[node] == 0 ||
+            count > best_count[node] ||
+            (count == best_count[node] && other < out.data[node])) {
+            out.data[node] = other;
+            best_count[node] = count;
+            has_best[node] = 1;
+        }
+    }
 }
 
 } // namespace bioimage_cpp::graph
diff --git a/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx b/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx
index f694f1f..5e47917 100644
--- a/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx
+++ b/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx
@@ -19,7 +19,9 @@ struct GreedyAdditiveWorkspace {
     void reset(const UndirectedGraph &lifted_graph) {
         dynamic_graph.reset(lifted_graph);
         union_find.reset(static_cast<std::size_t>(lifted_graph.number_of_nodes()));
-        heap.reset_capacity(static_cast<std::size_t>(lifted_graph.number_of_edges()));
+        // The heap is sized by detail::initialize_dynamic_graph, which every
+        // caller runs immediately after reset(); resizing it here too would
+        // wipe the locator vector twice.
     }
 };
 
diff --git a/include/bioimage_cpp/graph/multicut/greedy_additive.hxx b/include/bioimage_cpp/graph/multicut/greedy_additive.hxx
index cafc06d..259a1c6 100644
--- a/include/bioimage_cpp/graph/multicut/greedy_additive.hxx
+++ b/include/bioimage_cpp/graph/multicut/greedy_additive.hxx
@@ -21,7 +21,9 @@ struct GreedyAdditiveWorkspace {
     void reset(const UndirectedGraph &graph) {
         dynamic_graph.reset(graph);
         union_find.reset(static_cast<std::size_t>(graph.number_of_nodes()));
-        heap.reset_capacity(static_cast<std::size_t>(graph.number_of_edges()));
+        // The heap is sized by detail::initialize_dynamic_graph, which every
+        // caller runs immediately after reset(); resizing it here too would
+        // wipe the locator vector twice.
     }
 };
 

From 67c8c88fcb82d772ee14a5090cae8e6c351bccb4 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Fri, 12 Jun 2026 00:11:51 +0200
Subject: [PATCH 5/7] Deduplicate number_of_pixels and fix doc/naming
 inconsistencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

R1: three byte-identical number_of_pixels helpers (label_accumulation,
feature_accumulation, node_label_projection) — the feature_accumulation copy
was dead. Add detail::number_of_elements to detail/grid.hxx and route the two
live call sites through it; drop the dead copy.

R9: drop the redundant nb::cast wrappers inside nb::make_tuple in
relabel_sequential_t (make_tuple already casts its arguments).

Docs/naming:
- AGENTS.md: correct the union-find path (util/union_find.hxx, namespace
  bioimage_cpp::util, not detail/) and list number_of_elements under grid.hxx.
- agglomeration/detail.hxx: the rekey comment described the opposite of the
  code; reword to explain that a kRejectEdge'd edge is deliberately left out of
  the heap.
- filters: the public kwarg was window_ratio while the Python API documents
  window_size; align the nb::arg keyword and the validation message to
  window_size (calls are positional, so behavior is unchanged).
- feature_accumulation: note that float feature values are bit-reproducible
  only for a fixed thread count.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 AGENTS.md                                     |  4 +--
 include/bioimage_cpp/detail/grid.hxx          | 10 ++++++
 .../graph/agglomeration/detail.hxx            |  5 +--
 .../graph/feature_accumulation.hxx            | 14 +++-----
 .../bioimage_cpp/graph/label_accumulation.hxx | 13 ++------
 .../graph/node_label_projection.hxx           | 13 ++------
 src/bindings/filters.cxx                      | 32 +++++++++----------
 src/bindings/segmentation.cxx                 |  6 +---
 8 files changed, 41 insertions(+), 56 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 9f0e0f8..1d00dae 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -24,10 +24,10 @@ Required in the environment: `scikit-build-core`, `cmake>=3.21`, `ninja`, `nanob
 
 Before introducing union-finds, priority queues, edge hashing, stride math, threading helpers, or label relabeling, check `include/bioimage_cpp/detail/`:
 
-- `detail/union_find.hxx` — `UnionFind` (path compression + union by rank). `find`, `merge`, `merge_to`, `unite_roots`.
+- `util/union_find.hxx` — `UnionFind` (namespace `bioimage_cpp::util`, not `detail`; path compression + union by rank). `find`, `merge`, `merge_to`, `unite_roots`.
 - `detail/indexed_heap.hxx` — addressable max-heap with mutable priorities. `DenseIndexedHeap` (vector-backed locator, integer keys in `[0, N)`) and `SparseIndexedHeap` (hashmap-backed, arbitrary keys).
 - `detail/edge_hash.hxx` — `Edge`, `edge_key`, `EdgeHash` for hashing unordered node pairs.
-- `detail/grid.hxx` — `c_order_strides`, `valid_offset_target`, `is_valid_grid_edge` for row-major grid offsets.
+- `detail/grid.hxx` — `number_of_elements`, `c_order_strides`, `valid_offset_target`, `is_valid_grid_edge` for row-major grid shapes/offsets.
 - `detail/relabel.hxx` — `dense_relabel` to map arbitrary labels onto `[0, k)` preserving first-occurrence order.
 - `detail/threading.hxx` — `normalize_thread_count`, `parallel_for_chunks(n_threads, n_items, chunk)`.
 
diff --git a/include/bioimage_cpp/detail/grid.hxx b/include/bioimage_cpp/detail/grid.hxx
index 4c0b4f8..d6afc76 100644
--- a/include/bioimage_cpp/detail/grid.hxx
+++ b/include/bioimage_cpp/detail/grid.hxx
@@ -6,6 +6,16 @@
 
 namespace bioimage_cpp::detail {
 
+// Total number of elements in a row-major array of the given shape (the product
+// of the per-axis extents). Shape entries are assumed non-negative.
+inline std::size_t number_of_elements(const std::vector<std::ptrdiff_t> &shape) {
+    std::size_t total = 1;
+    for (const auto extent : shape) {
+        total *= static_cast<std::size_t>(extent);
+    }
+    return total;
+}
+
 // C-order strides for a row-major array of the given shape, in units of array
 // elements (not bytes). The innermost (last) axis has stride 1.
 inline std::vector<std::ptrdiff_t> c_order_strides(const std::vector<std::ptrdiff_t> &shape) {
diff --git a/include/bioimage_cpp/graph/agglomeration/detail.hxx b/include/bioimage_cpp/graph/agglomeration/detail.hxx
index 2bf453c..92c97f3 100644
--- a/include/bioimage_cpp/graph/agglomeration/detail.hxx
+++ b/include/bioimage_cpp/graph/agglomeration/detail.hxx
@@ -93,8 +93,9 @@ inline std::size_t agglo_merge_dynamic_nodes(
             if (new_priority != current_priority) {
                 edge.weight = new_priority;
                 // The edge may have been previously popped via kRejectEdge
-                // (GASP cannot-link), in which case it is no longer in the
-                // heap. Use push_or_change to handle both cases.
+                // (GASP cannot-link). In that case it must stay out of the heap
+                // (re-adding it would let it be re-popped and merged), so only
+                // update the priority when the edge is still present.
                 if (heap.contains(removed_edge_id)) {
                     heap.change(removed_edge_id, new_priority);
                 }
diff --git a/include/bioimage_cpp/graph/feature_accumulation.hxx b/include/bioimage_cpp/graph/feature_accumulation.hxx
index 4b2db1f..fbdead4 100644
--- a/include/bioimage_cpp/graph/feature_accumulation.hxx
+++ b/include/bioimage_cpp/graph/feature_accumulation.hxx
@@ -88,15 +88,6 @@ inline double percentile(std::vector<double> &values, const double percentile) {
     return (1.0 - weight) * lower + weight * upper;
 }
 
-inline std::size_t number_of_pixels(const std::vector<std::ptrdiff_t> &shape) {
-    return static_cast<std::size_t>(std::accumulate(
-        shape.begin(),
-        shape.end(),
-        std::ptrdiff_t{1},
-        [](const std::ptrdiff_t a, const std::ptrdiff_t b) { return a * b; }
-    ));
-}
-
 inline void require_same_spatial_shape(
     const std::vector<std::ptrdiff_t> &labels_shape,
     const std::vector<std::ptrdiff_t> &data_shape,
@@ -370,6 +361,11 @@ void scan_affinity_3d_chunk(
     }
 }
 
+// Combine per-thread partial statistics. count/min/max/percentile results are
+// exact regardless of thread count; the floating-point sum / sum-of-squares
+// (and hence mean / std) can differ in the last ULP between thread counts,
+// because each thread sums a different subset of pixels. Float feature values
+// are therefore bit-reproducible only for a fixed thread count.
 template <class Stats>
 std::vector<Stats> merge_stats(
     std::vector<std::vector<Stats>> &per_thread_stats,
diff --git a/include/bioimage_cpp/graph/label_accumulation.hxx b/include/bioimage_cpp/graph/label_accumulation.hxx
index 5df6a85..d0ed3d4 100644
--- a/include/bioimage_cpp/graph/label_accumulation.hxx
+++ b/include/bioimage_cpp/graph/label_accumulation.hxx
@@ -1,13 +1,13 @@
 #pragma once
 
 #include "bioimage_cpp/array_view.hxx"
+#include "bioimage_cpp/detail/grid.hxx"
 #include "bioimage_cpp/detail/threading.hxx"
 #include "bioimage_cpp/graph/node_label_projection.hxx"
 #include "bioimage_cpp/graph/region_adjacency_graph.hxx"
 
 #include <cstddef>
 #include <cstdint>
-#include <numeric>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
@@ -17,15 +17,6 @@ namespace bioimage_cpp::graph {
 
 namespace detail_label_accumulation {
 
-inline std::size_t number_of_pixels(const std::vector<std::ptrdiff_t> &shape) {
-    return static_cast<std::size_t>(std::accumulate(
-        shape.begin(),
-        shape.end(),
-        std::ptrdiff_t{1},
-        [](const std::ptrdiff_t a, const std::ptrdiff_t b) { return a * b; }
-    ));
-}
-
 // Combined (node, other) histogram key. A single map per thread keyed by this
 // pair avoids the n_threads * n_nodes map-of-maps (one std::unordered_map per
 // node per thread), which dominates allocation for RAGs with many nodes.
@@ -104,7 +95,7 @@ void accumulate_labels(
         throw std::invalid_argument("labels contain a node id outside the rag");
     }
 
-    const auto n_pixels = detail_label_accumulation::number_of_pixels(labels.shape);
+    const auto n_pixels = bioimage_cpp::detail::number_of_elements(labels.shape);
     const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels);
 
     std::vector<detail_label_accumulation::NodeOtherHistogram<OtherT>> per_thread(n_threads);
diff --git a/include/bioimage_cpp/graph/node_label_projection.hxx b/include/bioimage_cpp/graph/node_label_projection.hxx
index afc8439..d784da3 100644
--- a/include/bioimage_cpp/graph/node_label_projection.hxx
+++ b/include/bioimage_cpp/graph/node_label_projection.hxx
@@ -1,12 +1,12 @@
 #pragma once
 
 #include "bioimage_cpp/array_view.hxx"
+#include "bioimage_cpp/detail/grid.hxx"
 #include "bioimage_cpp/detail/threading.hxx"
 #include "bioimage_cpp/graph/region_adjacency_graph.hxx"
 
 #include <cstddef>
 #include <cstdint>
-#include <numeric>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -15,15 +15,6 @@ namespace bioimage_cpp::graph {
 
 namespace detail_projection {
 
-inline std::size_t number_of_pixels(const std::vector<std::ptrdiff_t> &shape) {
-    return static_cast<std::size_t>(std::accumulate(
-        shape.begin(),
-        shape.end(),
-        std::ptrdiff_t{1},
-        [](const std::ptrdiff_t a, const std::ptrdiff_t b) { return a * b; }
-    ));
-}
-
 inline void require_rag_shape_matches_labels(
     const RegionAdjacencyGraph &rag,
     const std::vector<std::ptrdiff_t> &labels_shape
@@ -69,7 +60,7 @@ void project_node_labels_to_pixels(
         throw std::invalid_argument("labels contain a node id outside node_labels");
     }
 
-    const auto n_pixels = detail_projection::number_of_pixels(labels.shape);
+    const auto n_pixels = bioimage_cpp::detail::number_of_elements(labels.shape);
     const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels);
     bioimage_cpp::detail::parallel_for_chunks(
         n_threads,
diff --git a/src/bindings/filters.cxx b/src/bindings/filters.cxx
index 396cb63..874d56b 100644
--- a/src/bindings/filters.cxx
+++ b/src/bindings/filters.cxx
@@ -43,11 +43,11 @@ void require_order(int order, const char *name, const char *function) {
     }
 }
 
-void require_non_negative_window(double window_ratio, const char *function) {
-    if (window_ratio < 0.0) {
+void require_non_negative_window(double window_size, const char *function) {
+    if (window_size < 0.0) {
         throw std::invalid_argument(
-            std::string(function) + ": window_ratio must be >= 0 (0 selects the "
-            "default), got " + std::to_string(window_ratio)
+            std::string(function) + ": window_size must be >= 0 (0 selects the "
+            "default), got " + std::to_string(window_size)
         );
     }
 }
@@ -476,21 +476,21 @@ void bind_filters(nb::module_ &m) {
     m.def(
         "_gaussian_smoothing_2d_float32", &gaussian_smoothing_2d,
         nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "2D Gaussian smoothing on a float32 (ny, nx) image with anisotropic sigma."
     );
     m.def(
         "_gaussian_smoothing_3d_float32", &gaussian_smoothing_3d,
         nb::arg("image"),
         nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "3D Gaussian smoothing on a float32 (nz, ny, nx) image with anisotropic sigma."
     );
     m.def(
         "_gaussian_derivative_2d_float32", &gaussian_derivative_2d,
         nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"),
         nb::arg("order_y"), nb::arg("order_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "2D Gaussian derivative on a float32 (ny, nx) image with per-axis order."
     );
     m.def(
@@ -498,39 +498,39 @@ void bind_filters(nb::module_ &m) {
         nb::arg("image"),
         nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"),
         nb::arg("order_z"), nb::arg("order_y"), nb::arg("order_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "3D Gaussian derivative on a float32 (nz, ny, nx) image with per-axis order."
     );
     m.def(
         "_gaussian_gradient_magnitude_2d_float32", &gaussian_gradient_magnitude_2d,
         nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Gradient magnitude of a Gaussian-smoothed 2D float32 image."
     );
     m.def(
         "_gaussian_gradient_magnitude_3d_float32", &gaussian_gradient_magnitude_3d,
         nb::arg("image"),
         nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Gradient magnitude of a Gaussian-smoothed 3D float32 image."
     );
     m.def(
         "_laplacian_of_gaussian_2d_float32", &laplacian_of_gaussian_2d,
         nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Laplacian of Gaussian on a 2D float32 image."
     );
     m.def(
         "_laplacian_of_gaussian_3d_float32", &laplacian_of_gaussian_3d,
         nb::arg("image"),
         nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Laplacian of Gaussian on a 3D float32 image."
     );
     m.def(
         "_hessian_of_gaussian_eigenvalues_2d_float32", &hessian_of_gaussian_eigenvalues_2d,
         nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Eigenvalues of the Hessian of Gaussian on a 2D float32 image. "
         "Output shape: (ny, nx, 2), sorted descending along the trailing axis."
     );
@@ -538,7 +538,7 @@ void bind_filters(nb::module_ &m) {
         "_hessian_of_gaussian_eigenvalues_3d_float32", &hessian_of_gaussian_eigenvalues_3d,
         nb::arg("image"),
         nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Eigenvalues of the Hessian of Gaussian on a 3D float32 image. "
         "Output shape: (nz, ny, nx, 3), sorted descending along the trailing axis."
     );
@@ -547,7 +547,7 @@ void bind_filters(nb::module_ &m) {
         nb::arg("image"),
         nb::arg("sigma_inner_y"), nb::arg("sigma_inner_x"),
         nb::arg("sigma_outer_y"), nb::arg("sigma_outer_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Eigenvalues of the structure tensor on a 2D float32 image. "
         "Output shape: (ny, nx, 2), sorted descending along the trailing axis."
     );
@@ -556,7 +556,7 @@ void bind_filters(nb::module_ &m) {
         nb::arg("image"),
         nb::arg("sigma_inner_z"), nb::arg("sigma_inner_y"), nb::arg("sigma_inner_x"),
         nb::arg("sigma_outer_z"), nb::arg("sigma_outer_y"), nb::arg("sigma_outer_x"),
-        nb::arg("window_ratio") = 0.0,
+        nb::arg("window_size") = 0.0,
         "Eigenvalues of the structure tensor on a 3D float32 image. "
         "Output shape: (nz, ny, nx, 3), sorted descending along the trailing axis."
     );
diff --git a/src/bindings/segmentation.cxx b/src/bindings/segmentation.cxx
index 3b8314b..39891bd 100644
--- a/src/bindings/segmentation.cxx
+++ b/src/bindings/segmentation.cxx
@@ -392,11 +392,7 @@ nb::tuple relabel_sequential_t(
         inverse_owner
     );
 
-    return nb::make_tuple(
-        nb::cast(relabeled_array),
-        nb::cast(forward_array),
-        nb::cast(inverse_array)
-    );
+    return nb::make_tuple(relabeled_array, forward_array, inverse_array);
 }
 
 template <class InT>

From 0f0c5407b85ae2c4f55fb8094075a068085e911a Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Fri, 12 Jun 2026 07:37:01 +0200
Subject: [PATCH 6/7] Revert two perf regressions found by A/B benchmarking;
 trim a third

A/B benchmarks (pre-review build vs the review branch) showed two of the
review's changes were net-negative, so they are reverted:

I4 (label accumulation): the combined-key (node, other) histogram was 3-15x
SLOWER than the per-node map-of-maps and scaled negatively with threads (a giant
hashmap probed once per pixel, plus a serial argmax). Restore the per-node
map-of-maps with the parallel merge. The detail::number_of_elements dedup (R1)
is kept. Tests confirm identical output.

B1 (mutex watershed): routing the neighbor through detail::valid_offset_target
(per-axis bounds check) cost ~17% on 3D in the hot loop. The public API already
guarantees valid_edges excludes out-of-bounds / wrapping edges, so restore the
single precomputed offset-stride add and document that precondition instead.

B2 (UnionFind bindings): keep the safety bounds check but replace the
per-element validation loop with a single std::max_element scan, cutting bulk
find overhead from ~1.35x to ~1.1x. The residual cost is the unavoidable extra
read of the id array; it only affects the non-hot Python convenience API.

BFS (I3) stays 4.5x faster; greedy-additive (I1) unchanged.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../bioimage_cpp/graph/label_accumulation.hxx | 97 ++++++++-----------
 .../segmentation/mutex_watershed.hxx          | 21 ++--
 .../segmentation/semantic_mutex_watershed.hxx | 20 ++--
 src/bindings/util.cxx                         | 25 +++--
 4 files changed, 86 insertions(+), 77 deletions(-)

diff --git a/include/bioimage_cpp/graph/label_accumulation.hxx b/include/bioimage_cpp/graph/label_accumulation.hxx
index d0ed3d4..1f2376f 100644
--- a/include/bioimage_cpp/graph/label_accumulation.hxx
+++ b/include/bioimage_cpp/graph/label_accumulation.hxx
@@ -17,32 +17,6 @@ namespace bioimage_cpp::graph {
 
 namespace detail_label_accumulation {
 
-// Combined (node, other) histogram key. A single map per thread keyed by this
-// pair avoids the n_threads * n_nodes map-of-maps (one std::unordered_map per
-// node per thread), which dominates allocation for RAGs with many nodes.
-template <class OtherT>
-struct NodeOther {
-    std::uint64_t node;
-    OtherT other;
-    bool operator==(const NodeOther &rhs) const noexcept {
-        return node == rhs.node && other == rhs.other;
-    }
-};
-
-template <class OtherT>
-struct NodeOtherHash {
-    std::size_t operator()(const NodeOther<OtherT> &key) const noexcept {
-        std::uint64_t h = key.node * 0x9E3779B97F4A7C15ULL;
-        h ^= static_cast<std::uint64_t>(key.other) + 0x9E3779B97F4A7C15ULL +
-             (h << 6) + (h >> 2);
-        return static_cast<std::size_t>(h);
-    }
-};
-
-template <class OtherT>
-using NodeOtherHistogram =
-    std::unordered_map<NodeOther<OtherT>, std::uint64_t, NodeOtherHash<OtherT>>;
-
 template <class LabelT, class OtherT>
 void scan_chunk(
     const LabelT *labels,
@@ -51,7 +25,7 @@ void scan_chunk(
     const std::size_t pixel_end,
     const bool has_ignore_value,
     const OtherT ignore_value,
-    NodeOtherHistogram<OtherT> &histogram
+    std::vector<std::unordered_map<OtherT, std::uint64_t>> &histograms
 ) {
     for (std::size_t index = pixel_begin; index < pixel_end; ++index) {
         const auto other = other_labels[index];
@@ -59,7 +33,7 @@ void scan_chunk(
             continue;
         }
         const auto node = detail::checked_label_to_node(labels[index]);
-        ++histogram[NodeOther<OtherT>{static_cast<std::uint64_t>(node), other}];
+        ++histograms[static_cast<std::size_t>(node)][other];
     }
 }
 
@@ -98,7 +72,10 @@ void accumulate_labels(
     const auto n_pixels = bioimage_cpp::detail::number_of_elements(labels.shape);
     const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels);
 
-    std::vector<detail_label_accumulation::NodeOtherHistogram<OtherT>> per_thread(n_threads);
+    std::vector<std::vector<std::unordered_map<OtherT, std::uint64_t>>> per_thread(
+        n_threads,
+        std::vector<std::unordered_map<OtherT, std::uint64_t>>(number_of_nodes)
+    );
 
     bioimage_cpp::detail::parallel_for_chunks(
         n_threads,
@@ -116,35 +93,41 @@ void accumulate_labels(
         }
     );
 
-    // Merge the per-thread histograms into one, then pick the majority `other`
-    // label per node (smaller label wins ties; nodes with no pixels stay 0).
-    // The argmax pass is single-threaded over distinct (node, other) pairs,
-    // which is cheap relative to the parallel pixel scan above.
-    auto &merged = per_thread[0];
-    for (std::size_t thread_id = 1; thread_id < per_thread.size(); ++thread_id) {
-        for (const auto &entry : per_thread[thread_id]) {
-            merged[entry.first] += entry.second;
-        }
-        detail_label_accumulation::NodeOtherHistogram<OtherT>().swap(per_thread[thread_id]);
-    }
-
-    for (std::size_t node = 0; node < number_of_nodes; ++node) {
-        out.data[node] = OtherT{0};
-    }
-    std::vector<std::uint64_t> best_count(number_of_nodes, 0);
-    std::vector<unsigned char> has_best(number_of_nodes, 0);
-    for (const auto &entry : merged) {
-        const auto node = static_cast<std::size_t>(entry.first.node);
-        const auto other = entry.first.other;
-        const auto count = entry.second;
-        if (has_best[node] == 0 ||
-            count > best_count[node] ||
-            (count == best_count[node] && other < out.data[node])) {
-            out.data[node] = other;
-            best_count[node] = count;
-            has_best[node] = 1;
+    // Merge per-thread histograms and pick majority per node in one pass over
+    // nodes. This is embarrassingly parallel across nodes. (A single combined
+    // (node, other) hashmap per thread was tried and was 3-15x slower — a giant
+    // map probed per pixel plus a serial argmax — so the per-node map-of-maps
+    // with a parallel merge stays.)
+    const auto node_threads = detail::normalize_thread_count(number_of_threads, number_of_nodes);
+    bioimage_cpp::detail::parallel_for_chunks(
+        node_threads,
+        number_of_nodes,
+        [&](const std::size_t, const std::size_t node_begin, const std::size_t node_end) {
+            for (std::size_t node = node_begin; node < node_end; ++node) {
+                std::unordered_map<OtherT, std::uint64_t> merged;
+                for (auto &thread_histograms : per_thread) {
+                    auto &node_hist = thread_histograms[node];
+                    for (const auto &entry : node_hist) {
+                        merged[entry.first] += entry.second;
+                    }
+                    node_hist.clear();
+                }
+                OtherT best_label = OtherT{0};
+                std::uint64_t best_count = 0;
+                bool has_best = false;
+                for (const auto &entry : merged) {
+                    if (!has_best ||
+                        entry.second > best_count ||
+                        (entry.second == best_count && entry.first < best_label)) {
+                        best_label = entry.first;
+                        best_count = entry.second;
+                        has_best = true;
+                    }
+                }
+                out.data[node] = has_best ? best_label : OtherT{0};
+            }
         }
-    }
+    );
 }
 
 } // namespace bioimage_cpp::graph
diff --git a/include/bioimage_cpp/segmentation/mutex_watershed.hxx b/include/bioimage_cpp/segmentation/mutex_watershed.hxx
index e75c645..ec84887 100644
--- a/include/bioimage_cpp/segmentation/mutex_watershed.hxx
+++ b/include/bioimage_cpp/segmentation/mutex_watershed.hxx
@@ -78,6 +78,18 @@ void mutex_watershed_grid(
     ));
     const auto spatial_strides = detail::c_order_strides(spatial_shape);
 
+    // Precondition: `valid_edges` must be 0 for every out-of-bounds / boundary-
+    // wrapping edge (the Python wrapper guarantees this). Given that, the
+    // neighbor flat index can be computed with a single precomputed stride add
+    // per edge instead of a per-axis bounds check, which matters in this hot
+    // loop (a per-axis valid_offset_target check measured ~17% slower on 3D).
+    std::vector<std::ptrdiff_t> offset_strides(number_of_channels, 0);
+    for (std::size_t channel = 0; channel < number_of_channels; ++channel) {
+        for (std::size_t axis = 0; axis < spatial_ndim; ++axis) {
+            offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis];
+        }
+    }
+
     const auto number_of_edges = number_of_nodes * number_of_channels;
     std::vector<WeightedGridEdge<T>> edge_order;
     edge_order.reserve(static_cast<std::size_t>(number_of_edges));
@@ -101,13 +113,8 @@ void mutex_watershed_grid(
         const auto channel = static_cast<std::size_t>(edge_id / number_of_nodes);
         const auto u = edge_id % number_of_nodes;
 
-        // Bounds-check the neighbor per axis rather than relying solely on the
-        // valid_edges mask: this keeps the kernel memory-safe and rejects edges
-        // that would wrap across a row/plane boundary.
-        std::uint64_t v = 0;
-        if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) {
-            continue;
-        }
+        const auto v_signed = static_cast<std::int64_t>(u) + static_cast<std::int64_t>(offset_strides[channel]);
+        const auto v = static_cast<std::uint64_t>(v_signed);
         std::uint64_t root_u = sets.find(u);
         std::uint64_t root_v = sets.find(v);
         if (root_u == root_v) {
diff --git a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx
index 5c10f41..b17a963 100644
--- a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx
+++ b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx
@@ -105,6 +105,17 @@ void semantic_mutex_watershed_grid(
     ));
     const auto spatial_strides = detail::c_order_strides(spatial_shape);
 
+    // Precondition: `valid_edges` is 0 for every out-of-bounds / boundary-
+    // wrapping spatial edge (guaranteed by the Python wrapper), so the neighbor
+    // flat index uses a single precomputed stride add per edge rather than a
+    // per-axis bounds check in this hot loop.
+    std::vector<std::ptrdiff_t> offset_strides(number_of_offsets, 0);
+    for (std::size_t channel = 0; channel < number_of_offsets; ++channel) {
+        for (std::size_t axis = 0; axis < spatial_ndim; ++axis) {
+            offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis];
+        }
+    }
+
     struct WeightedGridEdge {
         T weight;
         std::uint64_t id;
@@ -148,12 +159,9 @@ void semantic_mutex_watershed_grid(
             continue;
         }
 
-        // Bounds-check the neighbor per axis (see mutex_watershed_grid): keeps
-        // the kernel memory-safe and rejects edges that wrap across a boundary.
-        std::uint64_t v = 0;
-        if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) {
-            continue;
-        }
+        const auto v_signed = static_cast<std::int64_t>(u) +
+            static_cast<std::int64_t>(offset_strides[channel]);
+        const auto v = static_cast<std::uint64_t>(v_signed);
         const auto root_u = sets.find(u);
         const auto root_v = sets.find(v);
         if (root_u == root_v) {
diff --git a/src/bindings/util.cxx b/src/bindings/util.cxx
index 519c87c..7edce98 100644
--- a/src/bindings/util.cxx
+++ b/src/bindings/util.cxx
@@ -4,6 +4,7 @@
 
 #include <nanobind/ndarray.h>
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <stdexcept>
@@ -30,6 +31,21 @@ void check_node(const util::UnionFind &uf, const std::uint64_t node, const char
     }
 }
 
+// Bulk variant: a single max-scan over the ids (vectorizable, one pass) instead
+// of a per-element branch, so validating a large id array stays cheap.
+void check_nodes(const util::UnionFind &uf, const std::uint64_t *data, std::size_t count, const char *name) {
+    if (count == 0) {
+        return;
+    }
+    const auto max_node = *std::max_element(data, data + count);
+    if (max_node >= uf.size()) {
+        throw std::invalid_argument(
+            std::string(name) + " out of range: got " + name + "="
+            + std::to_string(max_node) + ", size=" + std::to_string(uf.size())
+        );
+    }
+}
+
 void merge_edges(
     util::UnionFind &uf,
     EdgeArray edges
@@ -51,10 +67,7 @@ void merge_edges(
     const auto n_edges = edges.shape(0);
     const auto *data = edges.data();
 
-    for (std::size_t i = 0; i < n_edges; ++i) {
-        check_node(uf, data[2 * i], "edge endpoint");
-        check_node(uf, data[2 * i + 1], "edge endpoint");
-    }
+    check_nodes(uf, data, 2 * n_edges, "edge endpoint");
 
     {
         nb::gil_scoped_release release;
@@ -67,9 +80,7 @@ void merge_edges(
 OutputArray find_nodes(util::UnionFind &uf, NodeArray nodes) {
     const auto n = nodes.shape(0);
     const auto *input = nodes.data();
-    for (std::size_t i = 0; i < n; ++i) {
-        check_node(uf, input[i], "node");
-    }
+    check_nodes(uf, input, n, "node");
 
     auto *out = new std::uint64_t[n]();
     nb::capsule owner(out, [](void *p) noexcept { delete[] static_cast<std::uint64_t *>(p); });

From 66ed315e53ecef199a10346c3649867941c8c24f Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Fri, 12 Jun 2026 07:37:01 +0200
Subject: [PATCH 7/7] Add bic-only timing benchmarks for union-find,
 accumulate-labels, BFS, mutex-ws

Standalone (no nifty/affogato) benchmarks for the areas touched by the code
review that lacked timing coverage, used for the A/B perf validation:
- utils/benchmark_union_find.py: bulk merge/find (B2 bounds-check cost)
- graph/benchmark_accumulate_labels.py: accumulate_labels across 1/4/8 threads
- graph/benchmark_bfs.py: lifted_edges_from_node_labels (workspace-reused BFS, I3)
- segmentation/benchmark_mutex_watershed.py: mutex_watershed grid kernel (B1)

Each follows the warmup + median/min pattern and exposes run() for a driver.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../graph/benchmark_accumulate_labels.py      | 82 +++++++++++++++++
 development/graph/benchmark_bfs.py            | 70 +++++++++++++++
 .../segmentation/benchmark_mutex_watershed.py | 89 +++++++++++++++++++
 development/utils/benchmark_union_find.py     | 70 +++++++++++++++
 4 files changed, 311 insertions(+)
 create mode 100644 development/graph/benchmark_accumulate_labels.py
 create mode 100644 development/graph/benchmark_bfs.py
 create mode 100644 development/segmentation/benchmark_mutex_watershed.py
 create mode 100644 development/utils/benchmark_union_find.py

diff --git a/development/graph/benchmark_accumulate_labels.py b/development/graph/benchmark_accumulate_labels.py
new file mode 100644
index 0000000..6327ed7
--- /dev/null
+++ b/development/graph/benchmark_accumulate_labels.py
@@ -0,0 +1,82 @@
+"""Benchmark RAG label accumulation (``accumulate_labels``).
+
+The code review replaced the per-thread ``n_threads x n_nodes`` map-of-maps with
+a single combined-key ``(node, other)`` histogram per thread, and made the
+per-node argmax single-threaded. This script times ``accumulate_labels`` across
+thread counts (to confirm the allocation win and rule out a high-thread-count
+regression from the now-sequential argmax) and across ``other``-label
+cardinalities.
+
+Inputs: a deterministic block-wise over-segmentation (each node is a contiguous
+block, as in a real over-segmentation), so the RAG adjacency is local.
+
+Not part of the test suite. Run::
+
+    python development/graph/benchmark_accumulate_labels.py --repeats 11
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def _make_labels(shape=(40, 256, 256), coarsen=(2, 4, 4)) -> np.ndarray:
+    coarse_shape = tuple(s // c for s, c in zip(shape, coarsen))
+    n_nodes = int(np.prod(coarse_shape))
+    coarse = np.arange(n_nodes, dtype=np.uint64).reshape(coarse_shape)
+    block = np.ones(coarsen, dtype=np.uint64)
+    return np.ascontiguousarray(np.kron(coarse, block))
+
+
+def run(repeats: int = 11) -> dict:
+    labels = _make_labels()
+    rag = bic.graph.region_adjacency_graph(labels)
+    n_nodes = int(rag.number_of_nodes)
+    rng = np.random.default_rng(0)
+
+    results: dict[str, dict] = {}
+    for n_other, tag in ((8, "dense8"), (2000, "sparse2000")):
+        other = rng.integers(0, n_other, size=labels.shape).astype(np.uint64)
+        other = np.ascontiguousarray(other)
+        for threads in (1, 4, 8):
+            def fn(t=threads, o=other):
+                bic.graph.features.accumulate_labels(rag, labels, o, number_of_threads=t)
+
+            key = f"accumulate_labels_{tag}_t{threads}"
+            results[key] = _timeit(fn, repeats)
+            results[key]["meta"] = {
+                "n_nodes": n_nodes,
+                "n_pixels": int(labels.size),
+                "n_other": n_other,
+                "threads": threads,
+            }
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=11)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats)
+    for name, r in results.items():
+        print(f"{name:<32} median={r['median'] * 1e3:8.3f} ms  min={r['min'] * 1e3:8.3f} ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/development/graph/benchmark_bfs.py b/development/graph/benchmark_bfs.py
new file mode 100644
index 0000000..02922e6
--- /dev/null
+++ b/development/graph/benchmark_bfs.py
@@ -0,0 +1,70 @@
+"""Benchmark the workspace-reused BFS path (``lifted_edges_from_node_labels``).
+
+The code review changed ``BfsWorkspace::reset`` from an O(N) clear of the
+visited/distance buffers to an O(1) generation-stamp bump. That only pays off
+when one workspace is reset across many sources, which is exactly what
+``lifted_edges_from_node_labels`` does (one workspace per chunk, reset per
+source). The single-call ``breadth_first_search`` builds a fresh workspace each
+call and would NOT show the change, so we benchmark the lifted-edge path.
+
+Single-threaded so the per-source reset cost is not hidden by parallelism. The
+node count is swept to expose the previous O(N^2)-of-memset behavior.
+
+Not part of the test suite. Run::
+
+    python development/graph/benchmark_bfs.py --repeats 5
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def run(repeats: int = 5, depth: int = 2) -> dict:
+    shapes = [(100, 100), (160, 160), (220, 220)]
+    results: dict[str, dict] = {}
+    for shape in shapes:
+        n_nodes = int(np.prod(shape))
+        graph = bic.graph.grid_graph(shape)
+        rng = np.random.default_rng(0)
+        node_labels = rng.integers(0, 50, size=(n_nodes,), dtype=np.uint64)
+
+        def fn(g=graph, nl=node_labels, d=depth):
+            bic.graph.lifted_multicut.lifted_edges_from_node_labels(
+                g, nl, graph_depth=d, number_of_threads=1
+            )
+
+        key = f"bfs_lifted_{n_nodes}nodes_d{depth}"
+        results[key] = _timeit(fn, repeats)
+        results[key]["meta"] = {"n_nodes": n_nodes, "shape": shape, "depth": depth}
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=5)
+    parser.add_argument("--depth", type=int, default=2)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats, depth=args.depth)
+    for name, r in results.items():
+        print(f"{name:<28} median={r['median'] * 1e3:9.3f} ms  min={r['min'] * 1e3:9.3f} ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/development/segmentation/benchmark_mutex_watershed.py b/development/segmentation/benchmark_mutex_watershed.py
new file mode 100644
index 0000000..d77abc4
--- /dev/null
+++ b/development/segmentation/benchmark_mutex_watershed.py
@@ -0,0 +1,89 @@
+"""Benchmark the mutex-watershed grid kernel (bic only).
+
+The code review routed the neighbor computation in ``mutex_watershed_grid``
+through ``detail::valid_offset_target`` (per-axis bounds check) instead of a
+single precomputed flat-offset add. This script times ``mutex_watershed`` on the
+ISBI affinities for 2D and 3D so we can A/B that inner-loop change against a
+pre-review build. No external (affogato) dependency — bic only.
+
+Not part of the test suite. Run::
+
+    python development/segmentation/benchmark_mutex_watershed.py --repeats 5
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+from bioimage_cpp._data import load_isbi_affinities
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def _attractive_flip(affs: np.ndarray, n_attractive: int) -> np.ndarray:
+    # Match the convention in the equivalence checker: attractive channels are
+    # turned into merge affinities (1 - aff).
+    out = affs.copy()
+    out[:n_attractive] *= -1
+    out[:n_attractive] += 1
+    return out
+
+
+def run(repeats: int = 5) -> dict:
+    affinities, offsets = load_isbi_affinities()
+    affinities = np.ascontiguousarray(affinities)
+    offsets = [tuple(o) for o in offsets]
+    results: dict[str, dict] = {}
+
+    # --- 2D: in-plane channels of a single z slice ---
+    channels_2d = [i for i, o in enumerate(offsets) if o[0] == 0]
+    aff2d = np.ascontiguousarray(affinities[channels_2d, 0, :256, :256])
+    offsets_2d = [offsets[i][1:] for i in channels_2d]
+    aff2d_flipped = _attractive_flip(aff2d, 2)
+
+    def mws_2d():
+        bic.segmentation.mutex_watershed(
+            aff2d_flipped, offsets_2d, number_of_attractive_channels=2
+        )
+
+    results["mws_2d_256x256"] = _timeit(mws_2d, repeats)
+    results["mws_2d_256x256"]["meta"] = {"shape": list(aff2d.shape)}
+
+    # --- 3D: small crop, all offsets ---
+    aff3d = np.ascontiguousarray(affinities[:, :6, :256, :256])
+    aff3d_flipped = _attractive_flip(aff3d, 3)
+
+    def mws_3d():
+        bic.segmentation.mutex_watershed(
+            aff3d_flipped, offsets, number_of_attractive_channels=3
+        )
+
+    results["mws_3d_6x256x256"] = _timeit(mws_3d, repeats)
+    results["mws_3d_6x256x256"]["meta"] = {"shape": list(aff3d.shape)}
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=5)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats)
+    for name, r in results.items():
+        print(f"{name:<20} median={r['median'] * 1e3:9.3f} ms  min={r['min'] * 1e3:9.3f} ms")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/development/utils/benchmark_union_find.py b/development/utils/benchmark_union_find.py
new file mode 100644
index 0000000..8ed518e
--- /dev/null
+++ b/development/utils/benchmark_union_find.py
@@ -0,0 +1,70 @@
+"""Micro-benchmark for the UnionFind Python bindings.
+
+Times the bulk ``merge((N, 2) edges)`` and ``find(node_array)`` entry points,
+which the code review changed to validate every node id against ``uf.size``
+before touching the C++ structure (an extra O(N) pre-pass). This script lets us
+A/B that pre-pass against a pre-review build.
+
+Not part of the test suite. Run::
+
+    python development/utils/benchmark_union_find.py --repeats 11
+"""
+from __future__ import annotations
+
+import argparse
+from statistics import median
+from time import perf_counter
+
+import numpy as np
+
+import bioimage_cpp as bic
+
+
+def _timeit(fn, repeats: int, warmup: int = 1) -> dict:
+    for _ in range(warmup):
+        fn()
+    timings = []
+    for _ in range(repeats):
+        t0 = perf_counter()
+        fn()
+        timings.append(perf_counter() - t0)
+    return {"median": median(timings), "min": min(timings), "n": repeats}
+
+
+def run(repeats: int = 11, n_nodes: int = 1_000_000, n_edges: int = 2_000_000) -> dict:
+    rng = np.random.default_rng(0)
+    edges = rng.integers(0, n_nodes, size=(n_edges, 2), dtype=np.uint64)
+    query = rng.integers(0, n_nodes, size=(n_edges,), dtype=np.uint64)
+
+    # Pre-merged structure for the find benchmark (find does not mutate the
+    # partition, so it can be reused across repeats).
+    merged = bic.utils.UnionFind(n_nodes)
+    merged.merge(edges)
+
+    def bulk_merge():
+        uf = bic.utils.UnionFind(n_nodes)
+        uf.merge(edges)
+
+    def bulk_find():
+        merged.find(query)
+
+    results = {
+        "uf_bulk_merge": _timeit(bulk_merge, repeats),
+        "uf_bulk_find": _timeit(bulk_find, repeats),
+    }
+    for r in results.values():
+        r["meta"] = {"n_nodes": n_nodes, "n_edges": n_edges}
+    return results
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--repeats", type=int, default=11)
+    args = parser.parse_args()
+    results = run(repeats=args.repeats)
+    for name, r in results.items():
+        print(f"{name:<16} median={r['median'] * 1e3:8.3f} ms  min={r['min'] * 1e3:8.3f} ms")
+
+
+if __name__ == "__main__":
+    main()