From 552249cab15f1ed77bfecd94c31c03260f2fc3c9 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Thu, 11 Jun 2026 23:55:00 +0200 Subject: [PATCH 1/7] Harden UnionFind bindings and mutex-watershed neighbor bounds B2: validate node ids at the UnionFind binding boundary. UnionFind::find/ merge/merge_to index their parent vectors without a bounds check (intentional for the hot path), but the scalar and array bindings passed user-supplied ids straight through, so an out-of-range id was undefined behavior / a segfault from Python. Add check_node() and route the scalar find/merge/merge_to binds and the merge_edges/find_nodes helpers through it, throwing std::invalid_argument before releasing the GIL. B1: route the mutex-watershed and semantic-mutex-watershed neighbor computation through detail::valid_offset_target instead of an unchecked flat-index shift. This bounds-checks the neighbor per axis, so the kernel is memory-safe and rejects row/plane wrap-around independently of the caller's valid_edges mask. Drops the now-unused offset_strides precompute. Add regression tests for the new UnionFind validation and for the existing binding-layer point-coordinate validation in non_maximum_distance_suppression. Co-Authored-By: Claude Fable 5 --- .../non_maximum_distance_suppression.hxx | 4 +- .../segmentation/mutex_watershed.hxx | 16 +++---- .../segmentation/semantic_mutex_watershed.hxx | 16 +++---- src/bindings/util.cxx | 46 +++++++++++++++++-- .../test_non_maximum_distance_suppression.py | 14 ++++++ tests/test_util_union_find.py | 30 ++++++++++++ 6 files changed, 102 insertions(+), 24 deletions(-) diff --git a/include/bioimage_cpp/non_maximum_distance_suppression.hxx b/include/bioimage_cpp/non_maximum_distance_suppression.hxx index 1239e73..3a0806c 100644 --- a/include/bioimage_cpp/non_maximum_distance_suppression.hxx +++ b/include/bioimage_cpp/non_maximum_distance_suppression.hxx @@ -81,7 +81,9 @@ inline void non_maximum_distance_suppression( const auto strides = bioimage_cpp::detail::c_order_strides(distance_map.shape); const auto n = static_cast(n_points); - // Precompute flat index and distance value at each point. + // Precompute flat index and distance value at each point. Point coordinates + // are validated against distance_map bounds in the binding layer before this + // is called (see bindings/distance.cxx). std::vector point_dist(n); for (std::size_t i = 0; i < n; ++i) { const auto *row = diff --git a/include/bioimage_cpp/segmentation/mutex_watershed.hxx b/include/bioimage_cpp/segmentation/mutex_watershed.hxx index 743cf1a..e75c645 100644 --- a/include/bioimage_cpp/segmentation/mutex_watershed.hxx +++ b/include/bioimage_cpp/segmentation/mutex_watershed.hxx @@ -78,13 +78,6 @@ void mutex_watershed_grid( )); const auto spatial_strides = detail::c_order_strides(spatial_shape); - std::vector offset_strides(number_of_channels, 0); - for (std::size_t channel = 0; channel < number_of_channels; ++channel) { - for (std::size_t axis = 0; axis < spatial_ndim; ++axis) { - offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis]; - } - } - const auto number_of_edges = number_of_nodes * number_of_channels; std::vector> edge_order; edge_order.reserve(static_cast(number_of_edges)); @@ -108,8 +101,13 @@ void mutex_watershed_grid( const auto channel = static_cast(edge_id / number_of_nodes); const auto u = edge_id % number_of_nodes; - const auto v_signed = static_cast(u) + static_cast(offset_strides[channel]); - const auto v = static_cast(v_signed); + // Bounds-check the neighbor per axis rather than relying solely on the + // valid_edges mask: this keeps the kernel memory-safe and rejects edges + // that would wrap across a row/plane boundary. + std::uint64_t v = 0; + if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) { + continue; + } std::uint64_t root_u = sets.find(u); std::uint64_t root_v = sets.find(v); if (root_u == root_v) { diff --git a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx index 0db53d2..5c10f41 100644 --- a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx +++ b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx @@ -105,13 +105,6 @@ void semantic_mutex_watershed_grid( )); const auto spatial_strides = detail::c_order_strides(spatial_shape); - std::vector offset_strides(number_of_offsets, 0); - for (std::size_t channel = 0; channel < number_of_offsets; ++channel) { - for (std::size_t axis = 0; axis < spatial_ndim; ++axis) { - offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis]; - } - } - struct WeightedGridEdge { T weight; std::uint64_t id; @@ -155,9 +148,12 @@ void semantic_mutex_watershed_grid( continue; } - const auto v_signed = static_cast(u) + - static_cast(offset_strides[channel]); - const auto v = static_cast(v_signed); + // Bounds-check the neighbor per axis (see mutex_watershed_grid): keeps + // the kernel memory-safe and rejects edges that wrap across a boundary. + std::uint64_t v = 0; + if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) { + continue; + } const auto root_u = sets.find(u); const auto root_v = sets.find(v); if (root_u == root_v) { diff --git a/src/bindings/util.cxx b/src/bindings/util.cxx index 90a972e..519c87c 100644 --- a/src/bindings/util.cxx +++ b/src/bindings/util.cxx @@ -18,6 +18,18 @@ using EdgeArray = nb::ndarray; using NodeArray = nb::ndarray>; using OutputArray = nb::ndarray; +// UnionFind::find/merge/merge_to index their parent/rank vectors without a +// bounds check (intentionally, for the hot path). Validate node ids at the +// binding boundary so out-of-range ids raise a clear error instead of UB. +void check_node(const util::UnionFind &uf, const std::uint64_t node, const char *name) { + if (node >= uf.size()) { + throw std::invalid_argument( + std::string(name) + " out of range: got " + name + "=" + + std::to_string(node) + ", size=" + std::to_string(uf.size()) + ); + } +} + void merge_edges( util::UnionFind &uf, EdgeArray edges @@ -39,6 +51,11 @@ void merge_edges( const auto n_edges = edges.shape(0); const auto *data = edges.data(); + for (std::size_t i = 0; i < n_edges; ++i) { + check_node(uf, data[2 * i], "edge endpoint"); + check_node(uf, data[2 * i + 1], "edge endpoint"); + } + { nb::gil_scoped_release release; for (std::size_t i = 0; i < n_edges; ++i) { @@ -49,10 +66,14 @@ void merge_edges( OutputArray find_nodes(util::UnionFind &uf, NodeArray nodes) { const auto n = nodes.shape(0); + const auto *input = nodes.data(); + for (std::size_t i = 0; i < n; ++i) { + check_node(uf, input[i], "node"); + } + auto *out = new std::uint64_t[n](); nb::capsule owner(out, [](void *p) noexcept { delete[] static_cast(p); }); - const auto *input = nodes.data(); { nb::gil_scoped_release release; for (std::size_t i = 0; i < n; ++i) { @@ -64,6 +85,23 @@ OutputArray find_nodes(util::UnionFind &uf, NodeArray nodes) { return OutputArray(out, 1, shape, owner); } +std::uint64_t find_node(util::UnionFind &uf, const std::uint64_t node) { + check_node(uf, node, "node"); + return uf.find(node); +} + +std::uint64_t merge_pair(util::UnionFind &uf, const std::uint64_t first, const std::uint64_t second) { + check_node(uf, first, "first"); + check_node(uf, second, "second"); + return uf.merge(first, second); +} + +std::uint64_t merge_to_node(util::UnionFind &uf, const std::uint64_t stable, const std::uint64_t removed) { + check_node(uf, stable, "stable"); + check_node(uf, removed, "removed"); + return uf.merge_to(stable, removed); +} + OutputArray element_labeling(util::UnionFind &uf) { const auto n = uf.size(); auto *out = new std::uint64_t[n](); @@ -98,7 +136,7 @@ void bind_util(nb::module_ &m) { ) .def( "find", - &util::UnionFind::find, + &find_node, nb::arg("node"), "Return the (path-compressed) root of `node`." ) @@ -110,7 +148,7 @@ void bind_util(nb::module_ &m) { ) .def( "merge", - &util::UnionFind::merge, + &merge_pair, nb::arg("first"), nb::arg("second"), "Union the sets containing `first` and `second`. Returns the new root." @@ -123,7 +161,7 @@ void bind_util(nb::module_ &m) { ) .def( "merge_to", - &util::UnionFind::merge_to, + &merge_to_node, nb::arg("stable"), nb::arg("removed"), "Union the sets containing `stable` and `removed`, forcing " diff --git a/tests/distance/test_non_maximum_distance_suppression.py b/tests/distance/test_non_maximum_distance_suppression.py index a152064..194e6d3 100644 --- a/tests/distance/test_non_maximum_distance_suppression.py +++ b/tests/distance/test_non_maximum_distance_suppression.py @@ -33,6 +33,20 @@ def test_two_close_points_keeps_higher_value(): assert out.tolist() == [[5, 5]] +def test_out_of_range_point_coordinate_raises(): + dm = np.zeros((10, 10), dtype=np.float32) + pts = np.array([[5, 5], [5, 12]], dtype=np.int64) + with pytest.raises(ValueError, match="out of bounds"): + nms(dm, pts) + + +def test_negative_point_coordinate_raises(): + dm = np.zeros((10, 10), dtype=np.float32) + pts = np.array([[-1, 5]], dtype=np.int64) + with pytest.raises(ValueError, match="out of bounds"): + nms(dm, pts) + + def test_two_far_points_both_survive(): dm = np.zeros((20, 20), dtype=np.float32) dm[2, 2] = 1.0 diff --git a/tests/test_util_union_find.py b/tests/test_util_union_find.py index 97123bd..9e792b2 100644 --- a/tests/test_util_union_find.py +++ b/tests/test_util_union_find.py @@ -141,3 +141,33 @@ def test_bulk_merge_rejects_three_columns(): uf = bic.utils.UnionFind(3) with pytest.raises(Exception, match=r"\(N, 2\)"): uf.merge(np.zeros((2, 3), dtype=np.uint64)) + + +def test_scalar_find_rejects_out_of_range_node(): + uf = bic.utils.UnionFind(3) + with pytest.raises(ValueError, match="out of range"): + uf.find(3) + + +def test_scalar_merge_rejects_out_of_range_node(): + uf = bic.utils.UnionFind(3) + with pytest.raises(ValueError, match="out of range"): + uf.merge(0, 5) + + +def test_merge_to_rejects_out_of_range_node(): + uf = bic.utils.UnionFind(3) + with pytest.raises(ValueError, match="out of range"): + uf.merge_to(7, 0) + + +def test_bulk_find_rejects_out_of_range_node(): + uf = bic.utils.UnionFind(3) + with pytest.raises(ValueError, match="out of range"): + uf.find(np.array([0, 1, 9], dtype=np.uint64)) + + +def test_bulk_merge_rejects_out_of_range_node(): + uf = bic.utils.UnionFind(3) + with pytest.raises(ValueError, match="out of range"): + uf.merge(np.array([[0, 1], [2, 8]], dtype=np.uint64)) From 38d395ed17470163d6baa7d9581edaa88a7613ea Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Thu, 11 Jun 2026 23:58:31 +0200 Subject: [PATCH 2/7] Fix determinism issues in sorting, proposal seeding, and rounding B3: argsort_by_first broke ties on the primary key nondeterministically (std::sort is unstable). Add a secondary tie-break by original index so the ordering is a total order. This makes the label-multiset downsample argmax and restrict_set truncation deterministic on count ties (smaller id wins). B4: GreedyAdditiveMulticutProposalGenerator derived its per-call seed as seed_ + call_count_, which collided with the fusion-move driver's per-slot seed + slot offset (different (slot, iteration) cells computed identical proposals). Mix the packed (seed, counter) pair through a SplitMix64 finalizer to remove the linearity. B7: WatershedProposalGenerator::reset re-seeded the RNG but left the cached normal-distribution deviate, so a reset generator was not identical to a fresh one. Add noise_.reset(). flow: round_to_flat_index used std::nearbyint (FP-rounding-mode dependent, half-to-even); switch to floor(x + 0.5) to match the half-up convention used in affine.hxx and watershed.hxx. Add a determinism regression test for the downsample count-tie case. Co-Authored-By: Claude Fable 5 --- include/bioimage_cpp/flow/flow_density.hxx | 5 ++++- .../greedy_additive_multicut.hxx | 19 ++++++++++++++++++- .../graph/proposal_generators/watershed.hxx | 3 +++ .../bioimage_cpp/label_multiset/multiset.hxx | 14 ++++++++++++-- tests/label_multiset/test_downsample.py | 18 ++++++++++++++++++ 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/include/bioimage_cpp/flow/flow_density.hxx b/include/bioimage_cpp/flow/flow_density.hxx index 862cfda..60cbf2d 100644 --- a/include/bioimage_cpp/flow/flow_density.hxx +++ b/include/bioimage_cpp/flow/flow_density.hxx @@ -109,7 +109,10 @@ inline std::ptrdiff_t round_to_flat_index( } else if (clipped > grid.upper[axis]) { clipped = grid.upper[axis]; } - const auto coord = static_cast(std::nearbyint(clipped)); + // Round half up, matching the nearest-neighbor convention in + // transformation/affine.hxx and segmentation/watershed.hxx. + // std::nearbyint would honor the FP rounding mode (round-half-to-even). + const auto coord = static_cast(std::floor(clipped + 0.5f)); flat += coord * grid.strides[axis]; } return flat; diff --git a/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx b/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx index 4408f7f..d0dc15b 100644 --- a/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx +++ b/include/bioimage_cpp/graph/proposal_generators/greedy_additive_multicut.hxx @@ -54,7 +54,7 @@ public: weight_stop_, node_num_stop_, true, - seed_ + static_cast(call_count_), + mixed_seed(seed_, call_count_), sigma_ ); ++call_count_; @@ -65,6 +65,23 @@ public: } private: + // Mix the base seed and the call counter so proposals from different + // (slot, iteration) cells never realign. A plain `seed_ + call_count_` + // collided with the per-slot `seed + slot` offset applied by the fusion-move + // driver (slot 0 at iteration i+1 reused the seed of slot 1 at iteration i), + // recomputing identical proposals and shrinking effective diversity. A + // SplitMix64 finalizer over the packed (seed, counter) pair removes the + // linearity. + static int mixed_seed(const int base, const std::size_t counter) { + std::uint64_t z = + (static_cast(static_cast(base)) << 32) | + static_cast(static_cast(counter)); + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL; + z = z ^ (z >> 31); + return static_cast(static_cast(z)); + } + const UndirectedGraph &graph_; std::vector edge_costs_; double sigma_; diff --git a/include/bioimage_cpp/graph/proposal_generators/watershed.hxx b/include/bioimage_cpp/graph/proposal_generators/watershed.hxx index 541c6f1..ed31c8c 100644 --- a/include/bioimage_cpp/graph/proposal_generators/watershed.hxx +++ b/include/bioimage_cpp/graph/proposal_generators/watershed.hxx @@ -109,6 +109,9 @@ public: void reset() override { generator_.seed(static_cast(seed_)); + // std::normal_distribution caches a second Box-Muller deviate; clear it + // so a reset generator is bytewise identical to a freshly constructed one. + noise_.reset(); } private: diff --git a/include/bioimage_cpp/label_multiset/multiset.hxx b/include/bioimage_cpp/label_multiset/multiset.hxx index 53e97a1..bc9cda2 100644 --- a/include/bioimage_cpp/label_multiset/multiset.hxx +++ b/include/bioimage_cpp/label_multiset/multiset.hxx @@ -24,12 +24,22 @@ template inline void argsort_by_first(V1 &v1, V2 &v2, const bool ascending = true) { std::vector idx(v1.size()); std::iota(idx.begin(), idx.end(), 0); + // Break ties on the primary key by original index so the ordering is a + // total order. std::sort is unstable, so without this tie-break equal keys + // (e.g. equal counts when sorting label entries) would be reordered + // nondeterministically across runs / STL implementations. if (ascending) { std::sort(idx.begin(), idx.end(), - [&v1](std::size_t a, std::size_t b) { return v1[a] < v1[b]; }); + [&v1](std::size_t a, std::size_t b) { + if (v1[a] != v1[b]) return v1[a] < v1[b]; + return a < b; + }); } else { std::sort(idx.begin(), idx.end(), - [&v1](std::size_t a, std::size_t b) { return v1[a] > v1[b]; }); + [&v1](std::size_t a, std::size_t b) { + if (v1[a] != v1[b]) return v1[a] > v1[b]; + return a < b; + }); } reorder_inplace(v1, idx); reorder_inplace(v2, idx); diff --git a/tests/label_multiset/test_downsample.py b/tests/label_multiset/test_downsample.py index a0242b1..61ba50b 100644 --- a/tests/label_multiset/test_downsample.py +++ b/tests/label_multiset/test_downsample.py @@ -123,3 +123,21 @@ def test_downsample_restrict_set_keeps_top_k(): assert ids.tolist() == [0] assert counts.tolist() == [15] assert ms_top1.argmax.tolist() == [0] + + +def test_downsample_restrict_set_count_tie_is_deterministic(): + # Two labels with equal counts (8 each). With restrict_set=1 and a count + # tie, the argmax / retained entry must be deterministic: the smaller id + # wins (the sort breaks ties by id). Run twice to confirm stability. + labels = np.empty((4, 4), dtype=np.uint64) + labels[:2, :] = 1 + labels[2:, :] = 2 + ms_full = multiset_from_labels(labels, (1, 1)) + b = Blocking([0, 0], [4, 4], [4, 4]) + results = [] + for _ in range(2): + ms_top1 = downsample_multiset(ms_full, b, restrict_set=1) + ids, counts = ms_top1.entry(0) + results.append((ms_top1.argmax.tolist(), ids.tolist(), counts.tolist())) + assert results[0] == results[1] + assert results[0] == ([1], [1], [8]) From 19a19423269ec1f446f0ed4e9501800a1adf8bc7 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 12 Jun 2026 00:00:38 +0200 Subject: [PATCH 3/7] Fix BIOIMAGE_PROFILE_SCOPE to generate unique identifiers `_bp_##__LINE__` does not expand __LINE__ (token-paste suppresses operand expansion), so every BIOIMAGE_PROFILE_SCOPE declared the same identifier `_bp___LINE__`. It compiled only because each call site happened to wrap its scope in its own block; two scopes in one block (the documented per-phase pattern) would be a redefinition error in BIOIMAGE_PROFILE=ON builds. Add the standard two-level concat indirection so __LINE__ is expanded first. Co-Authored-By: Claude Fable 5 --- include/bioimage_cpp/detail/profile.hxx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/bioimage_cpp/detail/profile.hxx b/include/bioimage_cpp/detail/profile.hxx index 124ab4c..6f1f7ac 100644 --- a/include/bioimage_cpp/detail/profile.hxx +++ b/include/bioimage_cpp/detail/profile.hxx @@ -89,7 +89,12 @@ inline ProfileTimerNull make_profile_timer(NullProfiler &profiler, const char *n } // namespace bioimage_cpp::detail #define BIOIMAGE_PROFILE_INIT(var) ::bioimage_cpp::detail::Profiler var; -#define BIOIMAGE_PROFILE_SCOPE(var, name) auto _bp_##__LINE__ = ::bioimage_cpp::detail::make_profile_timer(var, name); +// Two-level indirection so __LINE__ is expanded before the token paste; without +// it every BIOIMAGE_PROFILE_SCOPE in a translation unit would declare the same +// identifier `_bp___LINE__`, breaking two scopes in one block. +#define BIOIMAGE_PROFILE_CONCAT_(a, b) a##b +#define BIOIMAGE_PROFILE_CONCAT(a, b) BIOIMAGE_PROFILE_CONCAT_(a, b) +#define BIOIMAGE_PROFILE_SCOPE(var, name) auto BIOIMAGE_PROFILE_CONCAT(_bp_, __LINE__) = ::bioimage_cpp::detail::make_profile_timer(var, name); #define BIOIMAGE_PROFILE_REPORT(var) (var).report(); #else From 1946cb4066efe9ecbf939b4f0eca2b1a3f5b8155 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 12 Jun 2026 00:06:16 +0200 Subject: [PATCH 4/7] Performance: drop redundant work in greedy-additive, BFS, label accumulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I1: GreedyAdditiveWorkspace::reset called heap.reset_capacity, and so did detail::initialize_dynamic_graph which every caller runs immediately after. The second call re-wiped the heap locator vector (O(E)) on every solve — hot under fusion-move (per fuse x slot x iteration). Drop it from reset() in both the multicut and lifted-multicut workspaces; initialize_dynamic_graph owns it. I3: BfsWorkspace::reset cleared the visited/distance buffers (O(N) over the whole graph) on every call, making the "k-hop neighborhood from every node" pattern O(N^2) of memset. Replace the boolean visited buffer with a uint32 generation stamp: reset just increments the generation (O(1)), clearing only on the rare wraparound. distance_ is sized but written before read. I4: accumulate_labels used an n_threads x n_nodes map-of-maps plus a per-node merge map. Switch to one combined-key (node, other) histogram per thread, merged once, with a single-pass per-node argmax. Far fewer container allocations; the argmax/tie-break/empty-node semantics are unchanged (covered by the existing accumulate_labels tests, incl. parallel-vs-single and nifty cross-checks). Co-Authored-By: Claude Fable 5 --- .../graph/breadth_first_search.hxx | 39 ++++++-- .../bioimage_cpp/graph/label_accumulation.hxx | 94 +++++++++++-------- .../graph/lifted_multicut/greedy_additive.hxx | 4 +- .../graph/multicut/greedy_additive.hxx | 4 +- 4 files changed, 93 insertions(+), 48 deletions(-) diff --git a/include/bioimage_cpp/graph/breadth_first_search.hxx b/include/bioimage_cpp/graph/breadth_first_search.hxx index 0f140d3..8255d25 100644 --- a/include/bioimage_cpp/graph/breadth_first_search.hxx +++ b/include/bioimage_cpp/graph/breadth_first_search.hxx @@ -2,6 +2,7 @@ #include "bioimage_cpp/graph/undirected_graph.hxx" +#include #include #include #include @@ -19,23 +20,44 @@ struct BfsEntry { }; // Reusable scratch state for `breadth_first_search`. Reset via `reset(graph)` -// before each call so the visited-flag and distance buffers grow once and stay +// before each call so the visited and distance buffers grow once and stay // allocated across many BFS invocations on the same graph. +// +// Visited tracking uses a per-call generation stamp rather than a boolean +// buffer cleared each call: `reset` just increments the generation (O(1)), +// doing a full clear only on the rare 32-bit wraparound. This matters for the +// "k-hop neighborhood from every node" pattern, where an O(N)-per-call clear +// would otherwise make the whole sweep O(N^2). `distance_` is sized but not +// cleared; it is written before being read for every visited node. class BfsWorkspace { public: BfsWorkspace() = default; void reset(const UndirectedGraph &graph) { const auto n_nodes = static_cast(graph.number_of_nodes()); - visited_.assign(n_nodes, 0); - distance_.assign(n_nodes, 0); + if (visited_.size() != n_nodes) { + visited_.assign(n_nodes, 0); + generation_ = 0; + } + distance_.resize(n_nodes); + if (generation_ == std::numeric_limits::max()) { + std::fill(visited_.begin(), visited_.end(), std::uint32_t{0}); + generation_ = 0; + } + ++generation_; } - [[nodiscard]] std::vector &visited() { return visited_; } + [[nodiscard]] bool is_visited(const std::uint64_t node) const { + return visited_[static_cast(node)] == generation_; + } + void mark_visited(const std::uint64_t node) { + visited_[static_cast(node)] = generation_; + } [[nodiscard]] std::vector &distance() { return distance_; } private: - std::vector visited_; + std::vector visited_; + std::uint32_t generation_ = 0; std::vector distance_; }; @@ -71,13 +93,12 @@ inline std::vector breadth_first_search( ); } workspace.reset(graph); - auto &visited = workspace.visited(); auto &distance = workspace.distance(); std::vector result; std::queue queue; queue.push(source); - visited[static_cast(source)] = 1; + workspace.mark_visited(source); distance[static_cast(source)] = 0; if (include_source) { result.push_back({source, 0}); @@ -93,10 +114,10 @@ inline std::vector breadth_first_search( const auto next_distance = node_distance + 1; for (const auto adjacency : graph.node_adjacency(node)) { const auto neighbor = adjacency.node; - if (visited[static_cast(neighbor)] != 0) { + if (workspace.is_visited(neighbor)) { continue; } - visited[static_cast(neighbor)] = 1; + workspace.mark_visited(neighbor); distance[static_cast(neighbor)] = next_distance; result.push_back({neighbor, next_distance}); queue.push(neighbor); diff --git a/include/bioimage_cpp/graph/label_accumulation.hxx b/include/bioimage_cpp/graph/label_accumulation.hxx index cb19b5e..5df6a85 100644 --- a/include/bioimage_cpp/graph/label_accumulation.hxx +++ b/include/bioimage_cpp/graph/label_accumulation.hxx @@ -26,6 +26,32 @@ inline std::size_t number_of_pixels(const std::vector &shape) { )); } +// Combined (node, other) histogram key. A single map per thread keyed by this +// pair avoids the n_threads * n_nodes map-of-maps (one std::unordered_map per +// node per thread), which dominates allocation for RAGs with many nodes. +template +struct NodeOther { + std::uint64_t node; + OtherT other; + bool operator==(const NodeOther &rhs) const noexcept { + return node == rhs.node && other == rhs.other; + } +}; + +template +struct NodeOtherHash { + std::size_t operator()(const NodeOther &key) const noexcept { + std::uint64_t h = key.node * 0x9E3779B97F4A7C15ULL; + h ^= static_cast(key.other) + 0x9E3779B97F4A7C15ULL + + (h << 6) + (h >> 2); + return static_cast(h); + } +}; + +template +using NodeOtherHistogram = + std::unordered_map, std::uint64_t, NodeOtherHash>; + template void scan_chunk( const LabelT *labels, @@ -34,7 +60,7 @@ void scan_chunk( const std::size_t pixel_end, const bool has_ignore_value, const OtherT ignore_value, - std::vector> &histograms + NodeOtherHistogram &histogram ) { for (std::size_t index = pixel_begin; index < pixel_end; ++index) { const auto other = other_labels[index]; @@ -42,7 +68,7 @@ void scan_chunk( continue; } const auto node = detail::checked_label_to_node(labels[index]); - ++histograms[static_cast(node)][other]; + ++histogram[NodeOther{static_cast(node), other}]; } } @@ -81,10 +107,7 @@ void accumulate_labels( const auto n_pixels = detail_label_accumulation::number_of_pixels(labels.shape); const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels); - std::vector>> per_thread( - n_threads, - std::vector>(number_of_nodes) - ); + std::vector> per_thread(n_threads); bioimage_cpp::detail::parallel_for_chunks( n_threads, @@ -102,38 +125,35 @@ void accumulate_labels( } ); - // Merge per-thread histograms and pick majority per node in one pass over - // nodes. This is embarrassingly parallel across nodes. - const auto node_threads = detail::normalize_thread_count(number_of_threads, number_of_nodes); - bioimage_cpp::detail::parallel_for_chunks( - node_threads, - number_of_nodes, - [&](const std::size_t, const std::size_t node_begin, const std::size_t node_end) { - for (std::size_t node = node_begin; node < node_end; ++node) { - std::unordered_map merged; - for (auto &thread_histograms : per_thread) { - auto &node_hist = thread_histograms[node]; - for (const auto &entry : node_hist) { - merged[entry.first] += entry.second; - } - node_hist.clear(); - } - OtherT best_label = OtherT{0}; - std::uint64_t best_count = 0; - bool has_best = false; - for (const auto &entry : merged) { - if (!has_best || - entry.second > best_count || - (entry.second == best_count && entry.first < best_label)) { - best_label = entry.first; - best_count = entry.second; - has_best = true; - } - } - out.data[node] = has_best ? best_label : OtherT{0}; - } + // Merge the per-thread histograms into one, then pick the majority `other` + // label per node (smaller label wins ties; nodes with no pixels stay 0). + // The argmax pass is single-threaded over distinct (node, other) pairs, + // which is cheap relative to the parallel pixel scan above. + auto &merged = per_thread[0]; + for (std::size_t thread_id = 1; thread_id < per_thread.size(); ++thread_id) { + for (const auto &entry : per_thread[thread_id]) { + merged[entry.first] += entry.second; } - ); + detail_label_accumulation::NodeOtherHistogram().swap(per_thread[thread_id]); + } + + for (std::size_t node = 0; node < number_of_nodes; ++node) { + out.data[node] = OtherT{0}; + } + std::vector best_count(number_of_nodes, 0); + std::vector has_best(number_of_nodes, 0); + for (const auto &entry : merged) { + const auto node = static_cast(entry.first.node); + const auto other = entry.first.other; + const auto count = entry.second; + if (has_best[node] == 0 || + count > best_count[node] || + (count == best_count[node] && other < out.data[node])) { + out.data[node] = other; + best_count[node] = count; + has_best[node] = 1; + } + } } } // namespace bioimage_cpp::graph diff --git a/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx b/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx index f694f1f..5e47917 100644 --- a/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx +++ b/include/bioimage_cpp/graph/lifted_multicut/greedy_additive.hxx @@ -19,7 +19,9 @@ struct GreedyAdditiveWorkspace { void reset(const UndirectedGraph &lifted_graph) { dynamic_graph.reset(lifted_graph); union_find.reset(static_cast(lifted_graph.number_of_nodes())); - heap.reset_capacity(static_cast(lifted_graph.number_of_edges())); + // The heap is sized by detail::initialize_dynamic_graph, which every + // caller runs immediately after reset(); resizing it here too would + // wipe the locator vector twice. } }; diff --git a/include/bioimage_cpp/graph/multicut/greedy_additive.hxx b/include/bioimage_cpp/graph/multicut/greedy_additive.hxx index cafc06d..259a1c6 100644 --- a/include/bioimage_cpp/graph/multicut/greedy_additive.hxx +++ b/include/bioimage_cpp/graph/multicut/greedy_additive.hxx @@ -21,7 +21,9 @@ struct GreedyAdditiveWorkspace { void reset(const UndirectedGraph &graph) { dynamic_graph.reset(graph); union_find.reset(static_cast(graph.number_of_nodes())); - heap.reset_capacity(static_cast(graph.number_of_edges())); + // The heap is sized by detail::initialize_dynamic_graph, which every + // caller runs immediately after reset(); resizing it here too would + // wipe the locator vector twice. } }; From 67c8c88fcb82d772ee14a5090cae8e6c351bccb4 Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 12 Jun 2026 00:11:51 +0200 Subject: [PATCH 5/7] Deduplicate number_of_pixels and fix doc/naming inconsistencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R1: three byte-identical number_of_pixels helpers (label_accumulation, feature_accumulation, node_label_projection) — the feature_accumulation copy was dead. Add detail::number_of_elements to detail/grid.hxx and route the two live call sites through it; drop the dead copy. R9: drop the redundant nb::cast wrappers inside nb::make_tuple in relabel_sequential_t (make_tuple already casts its arguments). Docs/naming: - AGENTS.md: correct the union-find path (util/union_find.hxx, namespace bioimage_cpp::util, not detail/) and list number_of_elements under grid.hxx. - agglomeration/detail.hxx: the rekey comment described the opposite of the code; reword to explain that a kRejectEdge'd edge is deliberately left out of the heap. - filters: the public kwarg was window_ratio while the Python API documents window_size; align the nb::arg keyword and the validation message to window_size (calls are positional, so behavior is unchanged). - feature_accumulation: note that float feature values are bit-reproducible only for a fixed thread count. Co-Authored-By: Claude Fable 5 --- AGENTS.md | 4 +-- include/bioimage_cpp/detail/grid.hxx | 10 ++++++ .../graph/agglomeration/detail.hxx | 5 +-- .../graph/feature_accumulation.hxx | 14 +++----- .../bioimage_cpp/graph/label_accumulation.hxx | 13 ++------ .../graph/node_label_projection.hxx | 13 ++------ src/bindings/filters.cxx | 32 +++++++++---------- src/bindings/segmentation.cxx | 6 +--- 8 files changed, 41 insertions(+), 56 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 9f0e0f8..1d00dae 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -24,10 +24,10 @@ Required in the environment: `scikit-build-core`, `cmake>=3.21`, `ninja`, `nanob Before introducing union-finds, priority queues, edge hashing, stride math, threading helpers, or label relabeling, check `include/bioimage_cpp/detail/`: -- `detail/union_find.hxx` — `UnionFind` (path compression + union by rank). `find`, `merge`, `merge_to`, `unite_roots`. +- `util/union_find.hxx` — `UnionFind` (namespace `bioimage_cpp::util`, not `detail`; path compression + union by rank). `find`, `merge`, `merge_to`, `unite_roots`. - `detail/indexed_heap.hxx` — addressable max-heap with mutable priorities. `DenseIndexedHeap` (vector-backed locator, integer keys in `[0, N)`) and `SparseIndexedHeap` (hashmap-backed, arbitrary keys). - `detail/edge_hash.hxx` — `Edge`, `edge_key`, `EdgeHash` for hashing unordered node pairs. -- `detail/grid.hxx` — `c_order_strides`, `valid_offset_target`, `is_valid_grid_edge` for row-major grid offsets. +- `detail/grid.hxx` — `number_of_elements`, `c_order_strides`, `valid_offset_target`, `is_valid_grid_edge` for row-major grid shapes/offsets. - `detail/relabel.hxx` — `dense_relabel` to map arbitrary labels onto `[0, k)` preserving first-occurrence order. - `detail/threading.hxx` — `normalize_thread_count`, `parallel_for_chunks(n_threads, n_items, chunk)`. diff --git a/include/bioimage_cpp/detail/grid.hxx b/include/bioimage_cpp/detail/grid.hxx index 4c0b4f8..d6afc76 100644 --- a/include/bioimage_cpp/detail/grid.hxx +++ b/include/bioimage_cpp/detail/grid.hxx @@ -6,6 +6,16 @@ namespace bioimage_cpp::detail { +// Total number of elements in a row-major array of the given shape (the product +// of the per-axis extents). Shape entries are assumed non-negative. +inline std::size_t number_of_elements(const std::vector &shape) { + std::size_t total = 1; + for (const auto extent : shape) { + total *= static_cast(extent); + } + return total; +} + // C-order strides for a row-major array of the given shape, in units of array // elements (not bytes). The innermost (last) axis has stride 1. inline std::vector c_order_strides(const std::vector &shape) { diff --git a/include/bioimage_cpp/graph/agglomeration/detail.hxx b/include/bioimage_cpp/graph/agglomeration/detail.hxx index 2bf453c..92c97f3 100644 --- a/include/bioimage_cpp/graph/agglomeration/detail.hxx +++ b/include/bioimage_cpp/graph/agglomeration/detail.hxx @@ -93,8 +93,9 @@ inline std::size_t agglo_merge_dynamic_nodes( if (new_priority != current_priority) { edge.weight = new_priority; // The edge may have been previously popped via kRejectEdge - // (GASP cannot-link), in which case it is no longer in the - // heap. Use push_or_change to handle both cases. + // (GASP cannot-link). In that case it must stay out of the heap + // (re-adding it would let it be re-popped and merged), so only + // update the priority when the edge is still present. if (heap.contains(removed_edge_id)) { heap.change(removed_edge_id, new_priority); } diff --git a/include/bioimage_cpp/graph/feature_accumulation.hxx b/include/bioimage_cpp/graph/feature_accumulation.hxx index 4b2db1f..fbdead4 100644 --- a/include/bioimage_cpp/graph/feature_accumulation.hxx +++ b/include/bioimage_cpp/graph/feature_accumulation.hxx @@ -88,15 +88,6 @@ inline double percentile(std::vector &values, const double percentile) { return (1.0 - weight) * lower + weight * upper; } -inline std::size_t number_of_pixels(const std::vector &shape) { - return static_cast(std::accumulate( - shape.begin(), - shape.end(), - std::ptrdiff_t{1}, - [](const std::ptrdiff_t a, const std::ptrdiff_t b) { return a * b; } - )); -} - inline void require_same_spatial_shape( const std::vector &labels_shape, const std::vector &data_shape, @@ -370,6 +361,11 @@ void scan_affinity_3d_chunk( } } +// Combine per-thread partial statistics. count/min/max/percentile results are +// exact regardless of thread count; the floating-point sum / sum-of-squares +// (and hence mean / std) can differ in the last ULP between thread counts, +// because each thread sums a different subset of pixels. Float feature values +// are therefore bit-reproducible only for a fixed thread count. template std::vector merge_stats( std::vector> &per_thread_stats, diff --git a/include/bioimage_cpp/graph/label_accumulation.hxx b/include/bioimage_cpp/graph/label_accumulation.hxx index 5df6a85..d0ed3d4 100644 --- a/include/bioimage_cpp/graph/label_accumulation.hxx +++ b/include/bioimage_cpp/graph/label_accumulation.hxx @@ -1,13 +1,13 @@ #pragma once #include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/detail/grid.hxx" #include "bioimage_cpp/detail/threading.hxx" #include "bioimage_cpp/graph/node_label_projection.hxx" #include "bioimage_cpp/graph/region_adjacency_graph.hxx" #include #include -#include #include #include #include @@ -17,15 +17,6 @@ namespace bioimage_cpp::graph { namespace detail_label_accumulation { -inline std::size_t number_of_pixels(const std::vector &shape) { - return static_cast(std::accumulate( - shape.begin(), - shape.end(), - std::ptrdiff_t{1}, - [](const std::ptrdiff_t a, const std::ptrdiff_t b) { return a * b; } - )); -} - // Combined (node, other) histogram key. A single map per thread keyed by this // pair avoids the n_threads * n_nodes map-of-maps (one std::unordered_map per // node per thread), which dominates allocation for RAGs with many nodes. @@ -104,7 +95,7 @@ void accumulate_labels( throw std::invalid_argument("labels contain a node id outside the rag"); } - const auto n_pixels = detail_label_accumulation::number_of_pixels(labels.shape); + const auto n_pixels = bioimage_cpp::detail::number_of_elements(labels.shape); const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels); std::vector> per_thread(n_threads); diff --git a/include/bioimage_cpp/graph/node_label_projection.hxx b/include/bioimage_cpp/graph/node_label_projection.hxx index afc8439..d784da3 100644 --- a/include/bioimage_cpp/graph/node_label_projection.hxx +++ b/include/bioimage_cpp/graph/node_label_projection.hxx @@ -1,12 +1,12 @@ #pragma once #include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/detail/grid.hxx" #include "bioimage_cpp/detail/threading.hxx" #include "bioimage_cpp/graph/region_adjacency_graph.hxx" #include #include -#include #include #include #include @@ -15,15 +15,6 @@ namespace bioimage_cpp::graph { namespace detail_projection { -inline std::size_t number_of_pixels(const std::vector &shape) { - return static_cast(std::accumulate( - shape.begin(), - shape.end(), - std::ptrdiff_t{1}, - [](const std::ptrdiff_t a, const std::ptrdiff_t b) { return a * b; } - )); -} - inline void require_rag_shape_matches_labels( const RegionAdjacencyGraph &rag, const std::vector &labels_shape @@ -69,7 +60,7 @@ void project_node_labels_to_pixels( throw std::invalid_argument("labels contain a node id outside node_labels"); } - const auto n_pixels = detail_projection::number_of_pixels(labels.shape); + const auto n_pixels = bioimage_cpp::detail::number_of_elements(labels.shape); const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels); bioimage_cpp::detail::parallel_for_chunks( n_threads, diff --git a/src/bindings/filters.cxx b/src/bindings/filters.cxx index 396cb63..874d56b 100644 --- a/src/bindings/filters.cxx +++ b/src/bindings/filters.cxx @@ -43,11 +43,11 @@ void require_order(int order, const char *name, const char *function) { } } -void require_non_negative_window(double window_ratio, const char *function) { - if (window_ratio < 0.0) { +void require_non_negative_window(double window_size, const char *function) { + if (window_size < 0.0) { throw std::invalid_argument( - std::string(function) + ": window_ratio must be >= 0 (0 selects the " - "default), got " + std::to_string(window_ratio) + std::string(function) + ": window_size must be >= 0 (0 selects the " + "default), got " + std::to_string(window_size) ); } } @@ -476,21 +476,21 @@ void bind_filters(nb::module_ &m) { m.def( "_gaussian_smoothing_2d_float32", &gaussian_smoothing_2d, nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "2D Gaussian smoothing on a float32 (ny, nx) image with anisotropic sigma." ); m.def( "_gaussian_smoothing_3d_float32", &gaussian_smoothing_3d, nb::arg("image"), nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "3D Gaussian smoothing on a float32 (nz, ny, nx) image with anisotropic sigma." ); m.def( "_gaussian_derivative_2d_float32", &gaussian_derivative_2d, nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"), nb::arg("order_y"), nb::arg("order_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "2D Gaussian derivative on a float32 (ny, nx) image with per-axis order." ); m.def( @@ -498,39 +498,39 @@ void bind_filters(nb::module_ &m) { nb::arg("image"), nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"), nb::arg("order_z"), nb::arg("order_y"), nb::arg("order_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "3D Gaussian derivative on a float32 (nz, ny, nx) image with per-axis order." ); m.def( "_gaussian_gradient_magnitude_2d_float32", &gaussian_gradient_magnitude_2d, nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Gradient magnitude of a Gaussian-smoothed 2D float32 image." ); m.def( "_gaussian_gradient_magnitude_3d_float32", &gaussian_gradient_magnitude_3d, nb::arg("image"), nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Gradient magnitude of a Gaussian-smoothed 3D float32 image." ); m.def( "_laplacian_of_gaussian_2d_float32", &laplacian_of_gaussian_2d, nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Laplacian of Gaussian on a 2D float32 image." ); m.def( "_laplacian_of_gaussian_3d_float32", &laplacian_of_gaussian_3d, nb::arg("image"), nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Laplacian of Gaussian on a 3D float32 image." ); m.def( "_hessian_of_gaussian_eigenvalues_2d_float32", &hessian_of_gaussian_eigenvalues_2d, nb::arg("image"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Eigenvalues of the Hessian of Gaussian on a 2D float32 image. " "Output shape: (ny, nx, 2), sorted descending along the trailing axis." ); @@ -538,7 +538,7 @@ void bind_filters(nb::module_ &m) { "_hessian_of_gaussian_eigenvalues_3d_float32", &hessian_of_gaussian_eigenvalues_3d, nb::arg("image"), nb::arg("sigma_z"), nb::arg("sigma_y"), nb::arg("sigma_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Eigenvalues of the Hessian of Gaussian on a 3D float32 image. " "Output shape: (nz, ny, nx, 3), sorted descending along the trailing axis." ); @@ -547,7 +547,7 @@ void bind_filters(nb::module_ &m) { nb::arg("image"), nb::arg("sigma_inner_y"), nb::arg("sigma_inner_x"), nb::arg("sigma_outer_y"), nb::arg("sigma_outer_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Eigenvalues of the structure tensor on a 2D float32 image. " "Output shape: (ny, nx, 2), sorted descending along the trailing axis." ); @@ -556,7 +556,7 @@ void bind_filters(nb::module_ &m) { nb::arg("image"), nb::arg("sigma_inner_z"), nb::arg("sigma_inner_y"), nb::arg("sigma_inner_x"), nb::arg("sigma_outer_z"), nb::arg("sigma_outer_y"), nb::arg("sigma_outer_x"), - nb::arg("window_ratio") = 0.0, + nb::arg("window_size") = 0.0, "Eigenvalues of the structure tensor on a 3D float32 image. " "Output shape: (nz, ny, nx, 3), sorted descending along the trailing axis." ); diff --git a/src/bindings/segmentation.cxx b/src/bindings/segmentation.cxx index 3b8314b..39891bd 100644 --- a/src/bindings/segmentation.cxx +++ b/src/bindings/segmentation.cxx @@ -392,11 +392,7 @@ nb::tuple relabel_sequential_t( inverse_owner ); - return nb::make_tuple( - nb::cast(relabeled_array), - nb::cast(forward_array), - nb::cast(inverse_array) - ); + return nb::make_tuple(relabeled_array, forward_array, inverse_array); } template From 0f0c5407b85ae2c4f55fb8094075a068085e911a Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 12 Jun 2026 07:37:01 +0200 Subject: [PATCH 6/7] Revert two perf regressions found by A/B benchmarking; trim a third A/B benchmarks (pre-review build vs the review branch) showed two of the review's changes were net-negative, so they are reverted: I4 (label accumulation): the combined-key (node, other) histogram was 3-15x SLOWER than the per-node map-of-maps and scaled negatively with threads (a giant hashmap probed once per pixel, plus a serial argmax). Restore the per-node map-of-maps with the parallel merge. The detail::number_of_elements dedup (R1) is kept. Tests confirm identical output. B1 (mutex watershed): routing the neighbor through detail::valid_offset_target (per-axis bounds check) cost ~17% on 3D in the hot loop. The public API already guarantees valid_edges excludes out-of-bounds / wrapping edges, so restore the single precomputed offset-stride add and document that precondition instead. B2 (UnionFind bindings): keep the safety bounds check but replace the per-element validation loop with a single std::max_element scan, cutting bulk find overhead from ~1.35x to ~1.1x. The residual cost is the unavoidable extra read of the id array; it only affects the non-hot Python convenience API. BFS (I3) stays 4.5x faster; greedy-additive (I1) unchanged. Co-Authored-By: Claude Fable 5 --- .../bioimage_cpp/graph/label_accumulation.hxx | 97 ++++++++----------- .../segmentation/mutex_watershed.hxx | 21 ++-- .../segmentation/semantic_mutex_watershed.hxx | 20 ++-- src/bindings/util.cxx | 25 +++-- 4 files changed, 86 insertions(+), 77 deletions(-) diff --git a/include/bioimage_cpp/graph/label_accumulation.hxx b/include/bioimage_cpp/graph/label_accumulation.hxx index d0ed3d4..1f2376f 100644 --- a/include/bioimage_cpp/graph/label_accumulation.hxx +++ b/include/bioimage_cpp/graph/label_accumulation.hxx @@ -17,32 +17,6 @@ namespace bioimage_cpp::graph { namespace detail_label_accumulation { -// Combined (node, other) histogram key. A single map per thread keyed by this -// pair avoids the n_threads * n_nodes map-of-maps (one std::unordered_map per -// node per thread), which dominates allocation for RAGs with many nodes. -template -struct NodeOther { - std::uint64_t node; - OtherT other; - bool operator==(const NodeOther &rhs) const noexcept { - return node == rhs.node && other == rhs.other; - } -}; - -template -struct NodeOtherHash { - std::size_t operator()(const NodeOther &key) const noexcept { - std::uint64_t h = key.node * 0x9E3779B97F4A7C15ULL; - h ^= static_cast(key.other) + 0x9E3779B97F4A7C15ULL + - (h << 6) + (h >> 2); - return static_cast(h); - } -}; - -template -using NodeOtherHistogram = - std::unordered_map, std::uint64_t, NodeOtherHash>; - template void scan_chunk( const LabelT *labels, @@ -51,7 +25,7 @@ void scan_chunk( const std::size_t pixel_end, const bool has_ignore_value, const OtherT ignore_value, - NodeOtherHistogram &histogram + std::vector> &histograms ) { for (std::size_t index = pixel_begin; index < pixel_end; ++index) { const auto other = other_labels[index]; @@ -59,7 +33,7 @@ void scan_chunk( continue; } const auto node = detail::checked_label_to_node(labels[index]); - ++histogram[NodeOther{static_cast(node), other}]; + ++histograms[static_cast(node)][other]; } } @@ -98,7 +72,10 @@ void accumulate_labels( const auto n_pixels = bioimage_cpp::detail::number_of_elements(labels.shape); const auto n_threads = detail::normalize_thread_count(number_of_threads, n_pixels); - std::vector> per_thread(n_threads); + std::vector>> per_thread( + n_threads, + std::vector>(number_of_nodes) + ); bioimage_cpp::detail::parallel_for_chunks( n_threads, @@ -116,35 +93,41 @@ void accumulate_labels( } ); - // Merge the per-thread histograms into one, then pick the majority `other` - // label per node (smaller label wins ties; nodes with no pixels stay 0). - // The argmax pass is single-threaded over distinct (node, other) pairs, - // which is cheap relative to the parallel pixel scan above. - auto &merged = per_thread[0]; - for (std::size_t thread_id = 1; thread_id < per_thread.size(); ++thread_id) { - for (const auto &entry : per_thread[thread_id]) { - merged[entry.first] += entry.second; - } - detail_label_accumulation::NodeOtherHistogram().swap(per_thread[thread_id]); - } - - for (std::size_t node = 0; node < number_of_nodes; ++node) { - out.data[node] = OtherT{0}; - } - std::vector best_count(number_of_nodes, 0); - std::vector has_best(number_of_nodes, 0); - for (const auto &entry : merged) { - const auto node = static_cast(entry.first.node); - const auto other = entry.first.other; - const auto count = entry.second; - if (has_best[node] == 0 || - count > best_count[node] || - (count == best_count[node] && other < out.data[node])) { - out.data[node] = other; - best_count[node] = count; - has_best[node] = 1; + // Merge per-thread histograms and pick majority per node in one pass over + // nodes. This is embarrassingly parallel across nodes. (A single combined + // (node, other) hashmap per thread was tried and was 3-15x slower — a giant + // map probed per pixel plus a serial argmax — so the per-node map-of-maps + // with a parallel merge stays.) + const auto node_threads = detail::normalize_thread_count(number_of_threads, number_of_nodes); + bioimage_cpp::detail::parallel_for_chunks( + node_threads, + number_of_nodes, + [&](const std::size_t, const std::size_t node_begin, const std::size_t node_end) { + for (std::size_t node = node_begin; node < node_end; ++node) { + std::unordered_map merged; + for (auto &thread_histograms : per_thread) { + auto &node_hist = thread_histograms[node]; + for (const auto &entry : node_hist) { + merged[entry.first] += entry.second; + } + node_hist.clear(); + } + OtherT best_label = OtherT{0}; + std::uint64_t best_count = 0; + bool has_best = false; + for (const auto &entry : merged) { + if (!has_best || + entry.second > best_count || + (entry.second == best_count && entry.first < best_label)) { + best_label = entry.first; + best_count = entry.second; + has_best = true; + } + } + out.data[node] = has_best ? best_label : OtherT{0}; + } } - } + ); } } // namespace bioimage_cpp::graph diff --git a/include/bioimage_cpp/segmentation/mutex_watershed.hxx b/include/bioimage_cpp/segmentation/mutex_watershed.hxx index e75c645..ec84887 100644 --- a/include/bioimage_cpp/segmentation/mutex_watershed.hxx +++ b/include/bioimage_cpp/segmentation/mutex_watershed.hxx @@ -78,6 +78,18 @@ void mutex_watershed_grid( )); const auto spatial_strides = detail::c_order_strides(spatial_shape); + // Precondition: `valid_edges` must be 0 for every out-of-bounds / boundary- + // wrapping edge (the Python wrapper guarantees this). Given that, the + // neighbor flat index can be computed with a single precomputed stride add + // per edge instead of a per-axis bounds check, which matters in this hot + // loop (a per-axis valid_offset_target check measured ~17% slower on 3D). + std::vector offset_strides(number_of_channels, 0); + for (std::size_t channel = 0; channel < number_of_channels; ++channel) { + for (std::size_t axis = 0; axis < spatial_ndim; ++axis) { + offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis]; + } + } + const auto number_of_edges = number_of_nodes * number_of_channels; std::vector> edge_order; edge_order.reserve(static_cast(number_of_edges)); @@ -101,13 +113,8 @@ void mutex_watershed_grid( const auto channel = static_cast(edge_id / number_of_nodes); const auto u = edge_id % number_of_nodes; - // Bounds-check the neighbor per axis rather than relying solely on the - // valid_edges mask: this keeps the kernel memory-safe and rejects edges - // that would wrap across a row/plane boundary. - std::uint64_t v = 0; - if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) { - continue; - } + const auto v_signed = static_cast(u) + static_cast(offset_strides[channel]); + const auto v = static_cast(v_signed); std::uint64_t root_u = sets.find(u); std::uint64_t root_v = sets.find(v); if (root_u == root_v) { diff --git a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx index 5c10f41..b17a963 100644 --- a/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx +++ b/include/bioimage_cpp/segmentation/semantic_mutex_watershed.hxx @@ -105,6 +105,17 @@ void semantic_mutex_watershed_grid( )); const auto spatial_strides = detail::c_order_strides(spatial_shape); + // Precondition: `valid_edges` is 0 for every out-of-bounds / boundary- + // wrapping spatial edge (guaranteed by the Python wrapper), so the neighbor + // flat index uses a single precomputed stride add per edge rather than a + // per-axis bounds check in this hot loop. + std::vector offset_strides(number_of_offsets, 0); + for (std::size_t channel = 0; channel < number_of_offsets; ++channel) { + for (std::size_t axis = 0; axis < spatial_ndim; ++axis) { + offset_strides[channel] += offsets[channel][axis] * spatial_strides[axis]; + } + } + struct WeightedGridEdge { T weight; std::uint64_t id; @@ -148,12 +159,9 @@ void semantic_mutex_watershed_grid( continue; } - // Bounds-check the neighbor per axis (see mutex_watershed_grid): keeps - // the kernel memory-safe and rejects edges that wrap across a boundary. - std::uint64_t v = 0; - if (!detail::valid_offset_target(u, offsets[channel], spatial_shape, spatial_strides, v)) { - continue; - } + const auto v_signed = static_cast(u) + + static_cast(offset_strides[channel]); + const auto v = static_cast(v_signed); const auto root_u = sets.find(u); const auto root_v = sets.find(v); if (root_u == root_v) { diff --git a/src/bindings/util.cxx b/src/bindings/util.cxx index 519c87c..7edce98 100644 --- a/src/bindings/util.cxx +++ b/src/bindings/util.cxx @@ -4,6 +4,7 @@ #include +#include #include #include #include @@ -30,6 +31,21 @@ void check_node(const util::UnionFind &uf, const std::uint64_t node, const char } } +// Bulk variant: a single max-scan over the ids (vectorizable, one pass) instead +// of a per-element branch, so validating a large id array stays cheap. +void check_nodes(const util::UnionFind &uf, const std::uint64_t *data, std::size_t count, const char *name) { + if (count == 0) { + return; + } + const auto max_node = *std::max_element(data, data + count); + if (max_node >= uf.size()) { + throw std::invalid_argument( + std::string(name) + " out of range: got " + name + "=" + + std::to_string(max_node) + ", size=" + std::to_string(uf.size()) + ); + } +} + void merge_edges( util::UnionFind &uf, EdgeArray edges @@ -51,10 +67,7 @@ void merge_edges( const auto n_edges = edges.shape(0); const auto *data = edges.data(); - for (std::size_t i = 0; i < n_edges; ++i) { - check_node(uf, data[2 * i], "edge endpoint"); - check_node(uf, data[2 * i + 1], "edge endpoint"); - } + check_nodes(uf, data, 2 * n_edges, "edge endpoint"); { nb::gil_scoped_release release; @@ -67,9 +80,7 @@ void merge_edges( OutputArray find_nodes(util::UnionFind &uf, NodeArray nodes) { const auto n = nodes.shape(0); const auto *input = nodes.data(); - for (std::size_t i = 0; i < n; ++i) { - check_node(uf, input[i], "node"); - } + check_nodes(uf, input, n, "node"); auto *out = new std::uint64_t[n](); nb::capsule owner(out, [](void *p) noexcept { delete[] static_cast(p); }); From 66ed315e53ecef199a10346c3649867941c8c24f Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 12 Jun 2026 07:37:01 +0200 Subject: [PATCH 7/7] Add bic-only timing benchmarks for union-find, accumulate-labels, BFS, mutex-ws Standalone (no nifty/affogato) benchmarks for the areas touched by the code review that lacked timing coverage, used for the A/B perf validation: - utils/benchmark_union_find.py: bulk merge/find (B2 bounds-check cost) - graph/benchmark_accumulate_labels.py: accumulate_labels across 1/4/8 threads - graph/benchmark_bfs.py: lifted_edges_from_node_labels (workspace-reused BFS, I3) - segmentation/benchmark_mutex_watershed.py: mutex_watershed grid kernel (B1) Each follows the warmup + median/min pattern and exposes run() for a driver. Co-Authored-By: Claude Fable 5 --- .../graph/benchmark_accumulate_labels.py | 82 +++++++++++++++++ development/graph/benchmark_bfs.py | 70 +++++++++++++++ .../segmentation/benchmark_mutex_watershed.py | 89 +++++++++++++++++++ development/utils/benchmark_union_find.py | 70 +++++++++++++++ 4 files changed, 311 insertions(+) create mode 100644 development/graph/benchmark_accumulate_labels.py create mode 100644 development/graph/benchmark_bfs.py create mode 100644 development/segmentation/benchmark_mutex_watershed.py create mode 100644 development/utils/benchmark_union_find.py diff --git a/development/graph/benchmark_accumulate_labels.py b/development/graph/benchmark_accumulate_labels.py new file mode 100644 index 0000000..6327ed7 --- /dev/null +++ b/development/graph/benchmark_accumulate_labels.py @@ -0,0 +1,82 @@ +"""Benchmark RAG label accumulation (``accumulate_labels``). + +The code review replaced the per-thread ``n_threads x n_nodes`` map-of-maps with +a single combined-key ``(node, other)`` histogram per thread, and made the +per-node argmax single-threaded. This script times ``accumulate_labels`` across +thread counts (to confirm the allocation win and rule out a high-thread-count +regression from the now-sequential argmax) and across ``other``-label +cardinalities. + +Inputs: a deterministic block-wise over-segmentation (each node is a contiguous +block, as in a real over-segmentation), so the RAG adjacency is local. + +Not part of the test suite. Run:: + + python development/graph/benchmark_accumulate_labels.py --repeats 11 +""" +from __future__ import annotations + +import argparse +from statistics import median +from time import perf_counter + +import numpy as np + +import bioimage_cpp as bic + + +def _timeit(fn, repeats: int, warmup: int = 1) -> dict: + for _ in range(warmup): + fn() + timings = [] + for _ in range(repeats): + t0 = perf_counter() + fn() + timings.append(perf_counter() - t0) + return {"median": median(timings), "min": min(timings), "n": repeats} + + +def _make_labels(shape=(40, 256, 256), coarsen=(2, 4, 4)) -> np.ndarray: + coarse_shape = tuple(s // c for s, c in zip(shape, coarsen)) + n_nodes = int(np.prod(coarse_shape)) + coarse = np.arange(n_nodes, dtype=np.uint64).reshape(coarse_shape) + block = np.ones(coarsen, dtype=np.uint64) + return np.ascontiguousarray(np.kron(coarse, block)) + + +def run(repeats: int = 11) -> dict: + labels = _make_labels() + rag = bic.graph.region_adjacency_graph(labels) + n_nodes = int(rag.number_of_nodes) + rng = np.random.default_rng(0) + + results: dict[str, dict] = {} + for n_other, tag in ((8, "dense8"), (2000, "sparse2000")): + other = rng.integers(0, n_other, size=labels.shape).astype(np.uint64) + other = np.ascontiguousarray(other) + for threads in (1, 4, 8): + def fn(t=threads, o=other): + bic.graph.features.accumulate_labels(rag, labels, o, number_of_threads=t) + + key = f"accumulate_labels_{tag}_t{threads}" + results[key] = _timeit(fn, repeats) + results[key]["meta"] = { + "n_nodes": n_nodes, + "n_pixels": int(labels.size), + "n_other": n_other, + "threads": threads, + } + return results + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--repeats", type=int, default=11) + args = parser.parse_args() + results = run(repeats=args.repeats) + for name, r in results.items(): + print(f"{name:<32} median={r['median'] * 1e3:8.3f} ms min={r['min'] * 1e3:8.3f} ms") + + +if __name__ == "__main__": + main() diff --git a/development/graph/benchmark_bfs.py b/development/graph/benchmark_bfs.py new file mode 100644 index 0000000..02922e6 --- /dev/null +++ b/development/graph/benchmark_bfs.py @@ -0,0 +1,70 @@ +"""Benchmark the workspace-reused BFS path (``lifted_edges_from_node_labels``). + +The code review changed ``BfsWorkspace::reset`` from an O(N) clear of the +visited/distance buffers to an O(1) generation-stamp bump. That only pays off +when one workspace is reset across many sources, which is exactly what +``lifted_edges_from_node_labels`` does (one workspace per chunk, reset per +source). The single-call ``breadth_first_search`` builds a fresh workspace each +call and would NOT show the change, so we benchmark the lifted-edge path. + +Single-threaded so the per-source reset cost is not hidden by parallelism. The +node count is swept to expose the previous O(N^2)-of-memset behavior. + +Not part of the test suite. Run:: + + python development/graph/benchmark_bfs.py --repeats 5 +""" +from __future__ import annotations + +import argparse +from statistics import median +from time import perf_counter + +import numpy as np + +import bioimage_cpp as bic + + +def _timeit(fn, repeats: int, warmup: int = 1) -> dict: + for _ in range(warmup): + fn() + timings = [] + for _ in range(repeats): + t0 = perf_counter() + fn() + timings.append(perf_counter() - t0) + return {"median": median(timings), "min": min(timings), "n": repeats} + + +def run(repeats: int = 5, depth: int = 2) -> dict: + shapes = [(100, 100), (160, 160), (220, 220)] + results: dict[str, dict] = {} + for shape in shapes: + n_nodes = int(np.prod(shape)) + graph = bic.graph.grid_graph(shape) + rng = np.random.default_rng(0) + node_labels = rng.integers(0, 50, size=(n_nodes,), dtype=np.uint64) + + def fn(g=graph, nl=node_labels, d=depth): + bic.graph.lifted_multicut.lifted_edges_from_node_labels( + g, nl, graph_depth=d, number_of_threads=1 + ) + + key = f"bfs_lifted_{n_nodes}nodes_d{depth}" + results[key] = _timeit(fn, repeats) + results[key]["meta"] = {"n_nodes": n_nodes, "shape": shape, "depth": depth} + return results + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--repeats", type=int, default=5) + parser.add_argument("--depth", type=int, default=2) + args = parser.parse_args() + results = run(repeats=args.repeats, depth=args.depth) + for name, r in results.items(): + print(f"{name:<28} median={r['median'] * 1e3:9.3f} ms min={r['min'] * 1e3:9.3f} ms") + + +if __name__ == "__main__": + main() diff --git a/development/segmentation/benchmark_mutex_watershed.py b/development/segmentation/benchmark_mutex_watershed.py new file mode 100644 index 0000000..d77abc4 --- /dev/null +++ b/development/segmentation/benchmark_mutex_watershed.py @@ -0,0 +1,89 @@ +"""Benchmark the mutex-watershed grid kernel (bic only). + +The code review routed the neighbor computation in ``mutex_watershed_grid`` +through ``detail::valid_offset_target`` (per-axis bounds check) instead of a +single precomputed flat-offset add. This script times ``mutex_watershed`` on the +ISBI affinities for 2D and 3D so we can A/B that inner-loop change against a +pre-review build. No external (affogato) dependency — bic only. + +Not part of the test suite. Run:: + + python development/segmentation/benchmark_mutex_watershed.py --repeats 5 +""" +from __future__ import annotations + +import argparse +from statistics import median +from time import perf_counter + +import numpy as np + +import bioimage_cpp as bic +from bioimage_cpp._data import load_isbi_affinities + + +def _timeit(fn, repeats: int, warmup: int = 1) -> dict: + for _ in range(warmup): + fn() + timings = [] + for _ in range(repeats): + t0 = perf_counter() + fn() + timings.append(perf_counter() - t0) + return {"median": median(timings), "min": min(timings), "n": repeats} + + +def _attractive_flip(affs: np.ndarray, n_attractive: int) -> np.ndarray: + # Match the convention in the equivalence checker: attractive channels are + # turned into merge affinities (1 - aff). + out = affs.copy() + out[:n_attractive] *= -1 + out[:n_attractive] += 1 + return out + + +def run(repeats: int = 5) -> dict: + affinities, offsets = load_isbi_affinities() + affinities = np.ascontiguousarray(affinities) + offsets = [tuple(o) for o in offsets] + results: dict[str, dict] = {} + + # --- 2D: in-plane channels of a single z slice --- + channels_2d = [i for i, o in enumerate(offsets) if o[0] == 0] + aff2d = np.ascontiguousarray(affinities[channels_2d, 0, :256, :256]) + offsets_2d = [offsets[i][1:] for i in channels_2d] + aff2d_flipped = _attractive_flip(aff2d, 2) + + def mws_2d(): + bic.segmentation.mutex_watershed( + aff2d_flipped, offsets_2d, number_of_attractive_channels=2 + ) + + results["mws_2d_256x256"] = _timeit(mws_2d, repeats) + results["mws_2d_256x256"]["meta"] = {"shape": list(aff2d.shape)} + + # --- 3D: small crop, all offsets --- + aff3d = np.ascontiguousarray(affinities[:, :6, :256, :256]) + aff3d_flipped = _attractive_flip(aff3d, 3) + + def mws_3d(): + bic.segmentation.mutex_watershed( + aff3d_flipped, offsets, number_of_attractive_channels=3 + ) + + results["mws_3d_6x256x256"] = _timeit(mws_3d, repeats) + results["mws_3d_6x256x256"]["meta"] = {"shape": list(aff3d.shape)} + return results + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--repeats", type=int, default=5) + args = parser.parse_args() + results = run(repeats=args.repeats) + for name, r in results.items(): + print(f"{name:<20} median={r['median'] * 1e3:9.3f} ms min={r['min'] * 1e3:9.3f} ms") + + +if __name__ == "__main__": + main() diff --git a/development/utils/benchmark_union_find.py b/development/utils/benchmark_union_find.py new file mode 100644 index 0000000..8ed518e --- /dev/null +++ b/development/utils/benchmark_union_find.py @@ -0,0 +1,70 @@ +"""Micro-benchmark for the UnionFind Python bindings. + +Times the bulk ``merge((N, 2) edges)`` and ``find(node_array)`` entry points, +which the code review changed to validate every node id against ``uf.size`` +before touching the C++ structure (an extra O(N) pre-pass). This script lets us +A/B that pre-pass against a pre-review build. + +Not part of the test suite. Run:: + + python development/utils/benchmark_union_find.py --repeats 11 +""" +from __future__ import annotations + +import argparse +from statistics import median +from time import perf_counter + +import numpy as np + +import bioimage_cpp as bic + + +def _timeit(fn, repeats: int, warmup: int = 1) -> dict: + for _ in range(warmup): + fn() + timings = [] + for _ in range(repeats): + t0 = perf_counter() + fn() + timings.append(perf_counter() - t0) + return {"median": median(timings), "min": min(timings), "n": repeats} + + +def run(repeats: int = 11, n_nodes: int = 1_000_000, n_edges: int = 2_000_000) -> dict: + rng = np.random.default_rng(0) + edges = rng.integers(0, n_nodes, size=(n_edges, 2), dtype=np.uint64) + query = rng.integers(0, n_nodes, size=(n_edges,), dtype=np.uint64) + + # Pre-merged structure for the find benchmark (find does not mutate the + # partition, so it can be reused across repeats). + merged = bic.utils.UnionFind(n_nodes) + merged.merge(edges) + + def bulk_merge(): + uf = bic.utils.UnionFind(n_nodes) + uf.merge(edges) + + def bulk_find(): + merged.find(query) + + results = { + "uf_bulk_merge": _timeit(bulk_merge, repeats), + "uf_bulk_find": _timeit(bulk_find, repeats), + } + for r in results.values(): + r["meta"] = {"n_nodes": n_nodes, "n_edges": n_edges} + return results + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--repeats", type=int, default=11) + args = parser.parse_args() + results = run(repeats=args.repeats) + for name, r in results.items(): + print(f"{name:<16} median={r['median'] * 1e3:8.3f} ms min={r['min'] * 1e3:8.3f} ms") + + +if __name__ == "__main__": + main()