diff --git a/crates/ppvm-python-native/src/interface_tableau.rs b/crates/ppvm-python-native/src/interface_tableau.rs
index 81c15561..a68b145d 100644
--- a/crates/ppvm-python-native/src/interface_tableau.rs
+++ b/crates/ppvm-python-native/src/interface_tableau.rs
@@ -169,6 +169,10 @@ macro_rules! create_interface {
                 self.cz(targets)
             }
 
+            pub fn cz_block(&mut self, control_base: usize, target_base: usize, count: usize) {
+                self.inner.cz_block(control_base, target_base, count);
+            }
+
             // rot1
             pub fn rx(&mut self, targets: Vec<usize>, theta: f64) {
                 self.inner.rx_many(targets.as_slice(), theta);
diff --git a/crates/ppvm-tableau/benches/tableau-msd-fused.rs b/crates/ppvm-tableau/benches/tableau-msd-fused.rs
index b4df03ee..46a3d3df 100644
--- a/crates/ppvm-tableau/benches/tableau-msd-fused.rs
+++ b/crates/ppvm-tableau/benches/tableau-msd-fused.rs
@@ -35,36 +35,32 @@ fn msd_func_fused<const MEASURE: bool>() -> (String, Tab) {
     tab.sqrt_x_many(ql[1]);
     tab.sqrt_x_many(ql[4]);
 
-    // ql[0] x ql[1]: pairs (0,17)...(16,33) — all in word 0
-    tab.cz_block_pairs(0, 17, 17);
+    // Cross-block CZ layers entangle two contiguous registers with a constant
+    // offset. `cz_block` takes plain qubit indices (control_base, target_base,
+    // count) and splits each run at u64-word boundaries internally, so it emits
+    // the same within-word / cross-word kernels the hand-written calls did.
+    let block_len = qubits_per_code_block;
 
-    // ql[2] x ql[3]: pairs (34,51)...(50,67)
-    tab.cz_block_pairs(34, 17, 13); // (34,51)...(46,63) in word 0
-    // (47,64)...(50,67): controls word 0 bits 47-50, targets word 1 bits 0-3
-    tab.cz_block_pairs_cross_word(0, 47, 1, 0, 4);
+    // ql[0] x ql[1]
+    tab.cz_block(ql[0][0], ql[1][0], block_len);
+    // ql[2] x ql[3]
+    tab.cz_block(ql[2][0], ql[3][0], block_len);
 
     // sqrt_y on ql[0] and ql[3]
     tab.sqrt_y_many(ql[0]);
     tab.sqrt_y_many(ql[3]);
 
-    // ql[0] x ql[2]: pairs (0,34)...(16,50) — all in word 0
-    tab.cz_block_pairs(0, 34, 17);
-
-    // ql[3] x ql[4]: (51,68)...(67,84)
-    // (51,68)...(63,80): controls word 0 bits 51-63, targets word 1 bits 4-16
-    tab.cz_block_pairs_cross_word(0, 51, 1, 4, 13);
-    tab.cz_block_pairs(64, 17, 4); // (64,81)...(67,84) both in word 1
+    // ql[0] x ql[2]
+    tab.cz_block(ql[0][0], ql[2][0], block_len);
+    // ql[3] x ql[4]
+    tab.cz_block(ql[3][0], ql[4][0], block_len);
 
     tab.sqrt_x_dag_many(ql[0]);
 
-    // ql[0] x ql[4]: (0,68)...(16,84)
-    // controls word 0 bits 0-16, targets word 1 bits 4-20
-    tab.cz_block_pairs_cross_word(0, 0, 1, 4, 17);
-
-    // ql[1] x ql[3]: (17,51)...(33,67)
-    tab.cz_block_pairs(17, 34, 13); // (17,51)...(29,63) in word 0
-    // (30,64)...(33,67): controls word 0 bits 30-33, targets word 1 bits 0-3
-    tab.cz_block_pairs_cross_word(0, 30, 1, 0, 4);
+    // ql[0] x ql[4]
+    tab.cz_block(ql[0][0], ql[4][0], block_len);
+    // ql[1] x ql[3]
+    tab.cz_block(ql[1][0], ql[3][0], block_len);
 
     // sqrt_x_dag on all blocks
     for block in ql.iter().take(5) {
diff --git a/crates/ppvm-tableau/src/data.rs b/crates/ppvm-tableau/src/data.rs
index d35e3e70..64f39708 100644
--- a/crates/ppvm-tableau/src/data.rs
+++ b/crates/ppvm-tableau/src/data.rs
@@ -771,6 +771,47 @@ where
         }
     }
 
+    /// Fused CZ on `count` constant-offset pairs `(control_base + i,
+    /// target_base + i)`, `i in 0..count`, given as plain qubit indices.
+    ///
+    /// The run is cut wherever either index would leave its storage word; a
+    /// piece whose two qubits share a word goes to [`Self::cz_block_pairs`],
+    /// any other piece to [`Self::cz_block_pairs_cross_word`]. CZ is symmetric,
+    /// so the bases are sorted and may be passed in either order.
+    /// NOTE(review): nothing here checks bounds or that the two runs are
+    /// disjoint -- confirm what the kernels require.
+    pub fn cz_block(&mut self, control_base: usize, target_base: usize, count: usize)
+    where
+        <<T::Storage as BitView>::Store as TryFrom<usize>>::Error: Debug,
+        <T::Storage as BitView>::Store: PrimInt + TryFrom<usize>,
+    {
+        if count == 0 {
+            return;
+        }
+        // Sort the bases so the offset handed to cz_block_pairs
+        // (`targ - ctrl`) cannot underflow.
+        let first = control_base.min(target_base);
+        let second = control_base.max(target_base);
+        let word_bits = std::mem::size_of::<<T::Storage as BitView>::Store>() * 8;
+        let mut done = 0;
+        while done < count {
+            let ctrl = first + done;
+            let targ = second + done;
+            let (ctrl_word, ctrl_bit) = (ctrl / word_bits, ctrl % word_bits);
+            let (targ_word, targ_bit) = (targ / word_bits, targ % word_bits);
+            // A segment ends at the first of: end of the control word, end of
+            // the target word, end of the block.
+            let to_word_end = (word_bits - ctrl_bit).min(word_bits - targ_bit);
+            let len = to_word_end.min(count - done);
+            if ctrl_word != targ_word {
+                self.cz_block_pairs_cross_word(ctrl_word, ctrl_bit, targ_word, targ_bit, len);
+            } else {
+                self.cz_block_pairs(ctrl, targ - ctrl, len);
+            }
+            done += len;
+        }
+    }
+
     // helper functions
 
     /// Compute the decomposition of a pauli into stabilizer destabilizer products
@@ -1415,4 +1456,42 @@ mod tests {
             snapshot_tableau(&tab2.tableau)
         );
     }
+
+    #[test]
+    fn test_cz_block_matches_individual_across_word_boundary() {
+        // Reference case: the MSD ql[2] x ql[3] layer, i.e. bases 34 and 51
+        // with 17 pairs. Pairs (34,51)..(46,63) sit entirely in word 0, while
+        // (47,64)..(50,67) have the control in word 0 and the target in word 1,
+        // so cz_block has to emit one within-word and one cross-word segment.
+        use ppvm_pauli_sum::config::fx64hash::Byte8F64;
+        type GTab = GeneralizedTableau<Byte8F64<2>>;
+        let n = 85;
+        let (control_base, target_base, count) = (34, 51, 17);
+
+        // Common starting state: H on every qubit.
+        let mut expected: GTab = GeneralizedTableau::new(n, 1e-12);
+        for q in 0..n {
+            Clifford::h(&mut expected.tableau, q);
+        }
+        let mut forward = expected.clone();
+        let mut reversed = expected.clone();
+
+        // Ground truth: one plain CZ per pair.
+        for k in 0..count {
+            Clifford::cz(&mut expected, control_base + k, target_base + k);
+        }
+
+        forward.cz_block(control_base, target_base, count);
+        assert_eq!(
+            snapshot_tableau(&expected.tableau),
+            snapshot_tableau(&forward.tableau)
+        );
+
+        // CZ is symmetric, so swapping the bases must not change the outcome.
+        reversed.cz_block(target_base, control_base, count);
+        assert_eq!(
+            snapshot_tableau(&expected.tableau),
+            snapshot_tableau(&reversed.tableau)
+        );
+    }
 }
diff --git a/ppvm-python/src/ppvm/_core.pyi b/ppvm-python/src/ppvm/_core.pyi
index c4a91ca6..d796ea55 100644
--- a/ppvm-python/src/ppvm/_core.pyi
+++ b/ppvm-python/src/ppvm/_core.pyi
@@ -136,6 +136,7 @@ class _GeneralizedTableauBase:
     def zcy(self, targets: Sequence[int]) -> None: ...
     def cz(self, targets: Sequence[int]) -> None: ...
     def zcz(self, targets: Sequence[int]) -> None: ...
+    def cz_block(self, control_base: int, target_base: int, count: int) -> None: ...
     def rx(self, targets: Sequence[int], theta: float) -> None: ...
     def ry(self, targets: Sequence[int], theta: float) -> None: ...
     def rz(self, targets: Sequence[int], theta: float) -> None: ...
diff --git a/ppvm-python/src/ppvm/generalized_tableau.py b/ppvm-python/src/ppvm/generalized_tableau.py
index b0e6cbe2..3423232a 100644
--- a/ppvm-python/src/ppvm/generalized_tableau.py
+++ b/ppvm-python/src/ppvm/generalized_tableau.py
@@ -45,6 +45,13 @@ class MeasurementResult(enum.IntEnum):
     LOST = 2
 
 
+# Enum singletons indexed by integer outcome value (0/1/2). A tuple index just
+# bumps a refcount, while ``MeasurementResult(i)`` per element makes the IntEnum
+# constructor dominate large readouts. Values past 2 raise ``IndexError`` here
+# rather than ``ValueError``. Shared with ``GeneralizedTableauSum``.
+_BY_VALUE = (MeasurementResult.ZERO, MeasurementResult.ONE, MeasurementResult.LOST)
+
+
 @dataclass(frozen=True)
 class GeneralizedTableau(
     CliffordMixin,
@@ -148,6 +155,26 @@ def t_dag(self, *targets: int | Iterable[int]) -> None:
         """
         self._interface.t_dag(_normalize_targets(targets))
 
+    def cz_block(self, control_base: int, target_base: int, count: int) -> None:
+        """Apply a fused block of CZ gates over constant-offset qubit pairs.
+
+        Applies CZ to ``(control_base + i, target_base + i)`` for every ``i`` in
+        ``range(count)``, i.e. the same pairs as the flat `cz` target list
+        ``[control_base, target_base, control_base + 1, target_base + 1, ...]``.
+        The native word-level kernel is much faster than the equivalent `cz`
+        call when the pairs form a contiguous, constant-offset block (e.g.
+        entangling two adjacent qubit registers). For scattered pairs, use `cz`.
+
+        CZ is symmetric, so ``control_base`` and ``target_base`` may be given in
+        either order.
+
+        Args:
+            control_base: First qubit of the control run.
+            target_base: First qubit of the target run.
+            count: Number of CZ pairs.
+        """
+        self._interface.cz_block(control_base, target_base, count)
+
     def measure(self, addr0: int) -> MeasurementResult:
         """Measure the specified qubit in the Z basis.
 
@@ -158,7 +185,7 @@ def measure(self, addr0: int) -> MeasurementResult:
             The measurement outcome as a ``MeasurementResult``, which is
             ``LOST`` if the qubit has been lost, ``ZERO`` or ``ONE`` otherwise.
         """
-        return MeasurementResult(self._interface.measure(addr0))
+        return _BY_VALUE[self._interface.measure(addr0)]
 
     def measure_many(self, *targets: int | Iterable[int]) -> list[MeasurementResult]:
         """Measure several qubits in the Z basis.
@@ -169,9 +196,7 @@ def measure_many(self, *targets: int | Iterable[int]) -> list[MeasurementResult]
         Returns:
             A list of ``MeasurementResult`` outcomes, one per target.
         """
-        return [
-            MeasurementResult(v) for v in self._interface.measure_many(_normalize_targets(targets))
-        ]
+        return [_BY_VALUE[v] for v in self._interface.measure_many(_normalize_targets(targets))]
 
     def current_measurement_record(self) -> list[MeasurementResult]:
         """Return all measurement outcomes recorded so far.
@@ -179,7 +204,7 @@ def current_measurement_record(self) -> list[MeasurementResult]:
         Returns:
             A list of ``MeasurementResult`` outcomes in measurement order.
         """
-        return [MeasurementResult(v) for v in self._interface.current_measurement_record()]
+        return [_BY_VALUE[v] for v in self._interface.current_measurement_record()]
 
     def coefficients(self) -> dict[int, complex]:
         """Return a snapshot of the sparse coefficient vector.
@@ -316,7 +341,7 @@ def run(self, prog: StimProgram) -> list[MeasurementResult]:
             fresh tableau per shot).
         """
         raw = self._interface.run(prog)
-        return [MeasurementResult(x) for x in raw]
+        return [_BY_VALUE[x] for x in raw]
 
     # stim familiarity alias
     do = run
@@ -345,7 +370,7 @@ def sample(
         """
         native_cls = _native_tableau_cls(n_qubits)
         raw = native_cls.sample(prog, n_qubits, min_abs_coeff, num_shots, seed)
-        return [[MeasurementResult(x) for x in shot] for shot in raw]
+        return [[_BY_VALUE[x] for x in shot] for shot in raw]
 
 
 def sample_stim(
diff --git a/ppvm-python/src/ppvm/generalized_tableau_sum.py b/ppvm-python/src/ppvm/generalized_tableau_sum.py
index 6769c06b..f48db4af 100644
--- a/ppvm-python/src/ppvm/generalized_tableau_sum.py
+++ b/ppvm-python/src/ppvm/generalized_tableau_sum.py
@@ -6,7 +6,7 @@
 from typing import cast
 
 from . import _core
-from .generalized_tableau import MeasurementResult
+from .generalized_tableau import _BY_VALUE, MeasurementResult
 from .mixins import (
     CliffordExtensionMixin,
     CliffordMixin,
@@ -17,12 +17,6 @@
 )
 from .types import GeneralizedTableauSumInterface, TableauSumSamplerInterface
 
-# Indexed by integer outcome value (0/1/2) to reuse the singleton enum members.
-# This is much faster than calling ``MeasurementResult(i)`` per element: the
-# IntEnum constructor dominates large shot batches, while a tuple index just
-# bumps a refcount.
-_BY_VALUE = (MeasurementResult.ZERO, MeasurementResult.ONE, MeasurementResult.LOST)
-
 
 @dataclass(frozen=True)
 class GeneralizedTableauSum(
diff --git a/ppvm-python/src/ppvm/mixins.py b/ppvm-python/src/ppvm/mixins.py
index dfa107b2..e5b76d20 100644
--- a/ppvm-python/src/ppvm/mixins.py
+++ b/ppvm-python/src/ppvm/mixins.py
@@ -43,32 +43,49 @@ def _is_sequence(obj: Any) -> bool:
     A bare ``int`` — including a numpy integer scalar, which is not iterable —
     is not a sequence, so it falls through to the variadic path.
     """
-    return isinstance(obj, Iterable) and not isinstance(obj, (str, bytes))
-
-
-def _normalize_targets(args: tuple[Any, ...]) -> list[int]:
+    # Concrete-type fast paths first: ``list``/``tuple`` (the overwhelmingly
+    # common splatted form) and bare ``int`` short-circuit before the slow ABC
+    # ``isinstance(obj, Iterable)`` dispatch, which is run only for the rare
+    # range / ndarray / generator cases. ``str``/``bytes`` are iterable but are
+    # never targets, so they report False.
+    if isinstance(obj, (list, tuple)):
+        return True
+    if isinstance(obj, (int, str, bytes)):
+        return False
+    return isinstance(obj, Iterable)
+
+
+def _normalize_targets(args: tuple[Any, ...]) -> Sequence[int]:
     """Resolve gate targets passed either as variadic indices (``x(0, 1, 2)``)
-    or as a single sequence (``x([0, 1, 2])``, ``x(np.array([0, 1, 2]))``)."""
+    or as a single sequence (``x([0, 1, 2])``, ``x(np.array([0, 1, 2]))``).
+
+    Indexable inputs and the variadic ``args`` tuple are returned as-is — the
+    native layer extracts a ``Vec<usize>`` directly, so no per-element ``int()``
+    rebuild is needed. Iterables without ``__getitem__`` (generators, sets) are
+    materialised into a list first, since that extraction needs a sequence."""
     if len(args) == 1 and _is_sequence(args[0]):
-        return [int(t) for t in args[0]]
-    return [int(t) for t in args]
+        return args[0] if hasattr(args[0], "__getitem__") else list(args[0])
+    return args
 
 
 def _split_targets_parameter(
     args: tuple[Any, ...],
     value: Any | None,
     name: str,
-) -> tuple[list[int], Any]:
+) -> tuple[Sequence[int], Any]:
     """Split ``(*targets, value)`` accepting ``value=...`` and a single leading
-    sequence of targets (``([0, 1, 2], theta)`` as well as ``(0, 1, 2, theta)``)."""
+    sequence of targets (``([0, 1, 2], theta)`` as well as ``(0, 1, 2, theta)``).
+
+    Targets are returned as-is (sequence or tuple slice); the native layer does
+    the ``Vec<usize>`` extraction, so no per-element ``int()`` rebuild is needed."""
     if args and _is_sequence(args[0]):
-        targets, rest = [int(t) for t in args[0]], args[1:]
+        targets, rest = args[0], args[1:]
     elif value is None:
         if not args:
             raise TypeError(f"missing required argument: {name!r}")
-        targets, rest = [int(t) for t in args[:-1]], args[-1:]
+        targets, rest = args[:-1], args[-1:]
     else:
-        targets, rest = [int(t) for t in args], ()
+        targets, rest = args, ()
     if value is None:
         if not rest:
             raise TypeError(f"missing required argument: {name!r}")
@@ -81,11 +98,14 @@ def _split_targets_parameter_truncate(
     value: Any | None,
     name: str,
     truncate: bool,
-) -> tuple[list[int], Any, bool]:
+) -> tuple[Sequence[int], Any, bool]:
     """Split ``(*targets, value[, truncate])`` for PauliSum methods, also
-    accepting a single leading sequence of targets."""
+    accepting a single leading sequence of targets.
+
+    Targets are returned as-is (sequence or tuple/list slice); the native layer
+    does the ``Vec<usize>`` extraction, so no per-element ``int()`` is needed."""
     if args and _is_sequence(args[0]):
-        targets = [int(t) for t in args[0]]
+        targets = args[0]
         rest = list(args[1:])
         if rest and isinstance(rest[-1], bool):
             truncate = rest.pop()
@@ -100,8 +120,8 @@ def _split_targets_parameter_truncate(
         args_list = list(args)
         if len(args_list) >= 2 and isinstance(args_list[-1], bool):
             truncate = args_list.pop()
-        return [int(t) for t in args_list[:-1]], args_list[-1], truncate
-    return [int(t) for t in args], value, truncate
+        return args_list[:-1], args_list[-1], truncate
+    return args, value, truncate
 
 
 class CliffordMixin:
diff --git a/ppvm-python/test/benchmarks/test_msd.py b/ppvm-python/test/benchmarks/test_msd.py
new file mode 100644
index 00000000..e9b5a55f
--- /dev/null
+++ b/ppvm-python/test/benchmarks/test_msd.py
@@ -0,0 +1,132 @@
+"""pytest-benchmark mirror of crates/ppvm-tableau/benches/tableau-msd-fused.rs.
+
+Builds the 85-qubit magic-state-distillation circuit (5 code blocks of 17
+qubits) on a `GeneralizedTableau`, timing the circuit construction. Every gate
+is *splatted* -- applied to a collection of qubits in a single call, the Python
+equivalent of the Rust ``*_many`` / ``cz_block`` fused methods: single-qubit
+gates broadcast over each target, ``cz`` consumes consecutive pairs, and the
+cross-block CZ layers use one ``cz_block`` call each.
+
+Two arms, mirroring the Rust bench's two ``bench_function`` calls:
+
+* ``test_msd_fused`` -- circuit construction only (``msd_func_fused::<false>``).
+* ``test_msd_fused_measure`` -- a full shot: construction plus a single
+  ``measure_many`` readout of all 85 qubits (``msd_func_fused::<true>``). The
+  difference between the two arms isolates the measurement cost.
+
+Circuit from Rafael:
+https://www.notion.so/Simulating-85-qubit-MSD-circuit-using-stabilizer-rank-decomposition-and-pyzx-288f86eeff3c802fb262ef1cfa69dfae
+
+Run the timed benchmark with:
+
+    uv run --project ppvm-python --group dev pytest ppvm-python/test/benchmarks/test_msd.py --benchmark-enable
+Without ``--benchmark-enable`` it runs once as a smoke test (see ``addopts`` in
+pyproject.toml).
+"""
+
+import pytest
+
+from ppvm import GeneralizedTableau
+
+QUBITS_PER_CODE_BLOCK = 17
+N_BLOCKS = 5
+N_QUBITS = QUBITS_PER_CODE_BLOCK * N_BLOCKS  # 85
+
+
+def _at(qubits: list[int], idxs: list[int]) -> list[int]:
+    """Translate block-local positions into the block's absolute qubit addresses."""
+    return list(map(qubits.__getitem__, idxs))
+
+
+def _pairs(qubits: list[int], index_pairs: list[tuple[int, int]]) -> list[int]:
+    """Expand block-local index pairs into the flat (a, b, a, b, ...) list cz expects."""
+    return list(map(qubits.__getitem__, (slot for pair in index_pairs for slot in pair)))
+
+
+def encode(tab: GeneralizedTableau, qubits: list[int]) -> None:
+    """Apply the fixed 7- or 17-qubit encoding gate sequence to ``qubits``."""
+    size = len(qubits)
+    if size == 7:
+        tab.sqrt_y_dag(_at(qubits, [0, 1, 2, 3, 4, 5]))
+        tab.cz(_pairs(qubits, [(1, 2), (3, 4), (5, 6)]))
+        tab.sqrt_y(qubits[6])
+        tab.cz(_pairs(qubits, [(0, 3), (2, 5), (4, 6)]))
+        tab.sqrt_y(_at(qubits, [2, 3, 4, 5, 6]))
+        tab.cz(_pairs(qubits, [(0, 1), (2, 3), (4, 5)]))
+        tab.sqrt_y(_at(qubits, [1, 2, 4]))
+    elif size == 17:
+        tab.sqrt_y(_at(qubits, [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16]))
+        tab.cz(_pairs(qubits, [(1, 3), (7, 10), (12, 14), (13, 16)]))
+        tab.sqrt_y_dag(_at(qubits, [7, 16]))
+        tab.cz(_pairs(qubits, [(4, 7), (8, 10), (11, 14), (15, 16)]))
+        tab.sqrt_y_dag(_at(qubits, [4, 10, 14, 16]))
+        tab.cz(_pairs(qubits, [(2, 4), (6, 8), (7, 9), (10, 13), (14, 16)]))
+        tab.sqrt_y(_at(qubits, [3, 6, 9, 10, 12, 13]))
+        tab.cz(_pairs(qubits, [(0, 2), (3, 6), (5, 8), (10, 12), (11, 13)]))
+        tab.sqrt_y(_at(qubits, [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 14]))
+        tab.cz(_pairs(qubits, [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (12, 15)]))
+        tab.sqrt_y_dag(_at(qubits, [0, 2, 5, 6, 8, 10, 12]))
+    else:
+        # Only the two block sizes above have a hard-coded gate sequence.
+        raise ValueError(f"Unsupported number of qubits {size}")
+
+
+def build_msd() -> GeneralizedTableau:
+    """Build the splatted 85-qubit MSD circuit on a fresh tableau and return it."""
+    tab = GeneralizedTableau(N_QUBITS)
+    block_len = QUBITS_PER_CODE_BLOCK
+
+    # Code block b owns the contiguous addresses
+    # [b * block_len, (b + 1) * block_len).
+    starts = range(0, N_QUBITS, block_len)
+    ql = [list(range(first, first + block_len)) for first in starts]
+
+    # Encoding: H + T on each block's encoding qubit, then encode the block.
+    for block in ql:
+        enc = block[6] if len(block) == 7 else block[7]
+        tab.h(enc)
+        tab.t(enc)
+        encode(tab, block)
+
+    # Middle layers (sqrt_x / cz / sqrt_y / sqrt_x_dag), all splatted. Every
+    # cross-block CZ layer pairs two contiguous registers at a constant
+    # offset, so it is issued as one word-fused cz_block call -- the same
+    # call the Rust bench makes -- instead of a flat per-pair cz target
+    # list. `base[b]` is the first qubit of code block b.
+    base = [block[0] for block in ql]
+    tab.sqrt_x(ql[0])
+    tab.sqrt_x(ql[1])
+    tab.sqrt_x(ql[4])
+    tab.cz_block(base[0], base[1], block_len)
+    tab.cz_block(base[2], base[3], block_len)
+    tab.sqrt_y(ql[0])
+    tab.sqrt_y(ql[3])
+    tab.cz_block(base[0], base[2], block_len)
+    tab.cz_block(base[3], base[4], block_len)
+    tab.sqrt_x_dag(ql[0])
+    tab.cz_block(base[0], base[4], block_len)
+    tab.cz_block(base[1], base[3], block_len)
+    for block in ql:
+        tab.sqrt_x_dag(block)
+
+    return tab
+
+
+def build_and_measure() -> list:
+    """Run one full shot: build the circuit, then measure every qubit in one call."""
+    # A single measure_many over range(N_QUBITS) reads out all qubits at once.
+    return build_msd().measure_many(range(N_QUBITS))
+
+
+@pytest.mark.benchmark(group="msd")
+def test_msd_fused(benchmark):
+    """Time construction only: each round builds a fresh tableau and applies the
+    whole circuit, like Rust's iter_batched_ref(|| {}, |_| msd_func_fused::<false>())."""
+    benchmark(build_msd)
+
+
+@pytest.mark.benchmark(group="msd")
+def test_msd_fused_measure(benchmark):
+    """Time construction plus one measure_many readout of all 85 qubits (a single
+    FFI call for the whole readout), mirroring Rust's msd_func_fused::<true>()."""
+    benchmark(build_and_measure)
diff --git a/skills/ppvm-usage/SKILL.md b/skills/ppvm-usage/SKILL.md
index 49bc2a0f..9d7180ed 100644
--- a/skills/ppvm-usage/SKILL.md
+++ b/skills/ppvm-usage/SKILL.md
@@ -48,7 +48,7 @@ Python hides all of this; the binding picks the variant automatically from `n_qu
 
 Non-Clifford gates *branch*: one Pauli term becomes a small linear combination. Without truncation, the sum grows unboundedly. Configure truncation at construction time, then apply it. **The when-to-apply rule differs by language:**
 
-- **Python**: the binding calls `truncate()` for you after every gate, so once you've passed the thresholds at construction time you don't touch `.truncate()` yourself. There is no `.truncate()` method on the Python `PauliSum` either.
+- **Python**: the binding calls `truncate()` for you after every gate method call by default, so once you've passed the thresholds at construction time you usually don't call `.truncate()` yourself. To compose several operations before pruning, pass `truncate=False` to those gate/noise calls, then call `ps.truncate()` once at the intended cut point.
 - **Rust**: `state.truncate()` is the user-driven trigger — gate methods do not call it for you. Call it at the points in your circuit where pruning makes sense (typically after each gate layer, or once per Trotter step). Without this call the policy you configured in the `Config` does nothing.
 
 **Python — kwargs on `PauliSum.new`:**
@@ -126,6 +126,8 @@ r0 = tab.measure(0)   # MeasurementResult.ZERO / .ONE / .LOST
 r1 = tab.measure(1)   # correlated with r0 (Bell state)
 ```
 
+For throughput on tableau circuits, batch same-gate layers in one Python call. Single-qubit gates accept variadic targets or a sequence: `tab.h(0, 2, 4)` and `tab.h([0, 2, 4])` are equivalent. Two-qubit gates consume a flat target list as consecutive pairs: `tab.cnot([0, 1, 2, 3])` applies `(0, 1)` and `(2, 3)`. For a contiguous, constant-offset block of CZ pairs, `tab.cz_block(control_base, target_base, count)` applies CZ to `(control_base + i, target_base + i)` for `i` in `range(count)` through a fused word-level kernel. Rotations and Pauli/depolarizing noise use the same convention with `theta=...` or `p=...`. This avoids one Python→Rust call per target and forwards to fused Rust tableau kernels internally. `measure` stays scalar; use `tab.measure_many([0, 1, 2])` for readout layers.
+
 Non-Clifford gates and Stim programs:
 
 ```python
@@ -202,6 +204,8 @@ tab.cnot(0, 1);
 let outcome = tab.measure(0);
 ```
 
+For layer-style tableau circuits in Rust, prefer explicit batch methods instead of per-target loops: `tab.h_many(&[0, 2, 4])`, `tab.cnot_many(&[(0, 1), (2, 3)])`, `tab.rx_many(&targets, theta)`, `tab.rzz_many(&pairs, theta)`, `tab.depolarize1_many(&targets, p)`, `tab.measure_many(&targets)`, `tab.reset_many(&targets)`, and the analogous `*_many` forms. `GeneralizedTableau` specializes these into fused bit operations. Other backends may expose trait-default `*_many` methods too, but the fused speedup is tableau-specific.
+
 Pick `IndexType` by qubit count: `usize` up to ~64, `u128` up to 128, `bnum::types::U256` / `U512` / `U1024` beyond. **Using `usize` past 64 qubits silently overflows** — this is the second-most-common bug after Heisenberg-order mistakes.
 
 ### Running Stim programs (Rust)
@@ -273,6 +277,7 @@ Important: the six off-diagonal two-qubit rotations (`rxy`, `rxz`, `ryx`, `ryz`,
 - `_dag` (not `_adj` or `_dagger`).
 - Prefer `p=...` and `theta=...` for readability in Python; trailing positional
   probabilities and angles are also accepted for compatibility.
+- Python tableau gate names do not grow a `_many` suffix. Pass multiple targets to the normal `GeneralizedTableau` gate (`tab.h([0, 1])`, `tab.rzz([0, 1, 2, 3], theta=...)`); use `_many` only in Rust and for Python `measure_many`.
 - The Python `PauliSum` is intentionally a narrow workhorse focused on noisy-circuit observables. For `t`, `u3`, `cy`, mid-circuit `measure`, or `reset`, use `GeneralizedTableau` (Python) or drop to Rust.
 
 ## Common pitfalls (rank-ordered by how often agents hit them)
@@ -280,10 +285,11 @@ Important: the six off-diagonal two-qubit rotations (`rxy`, `rxz`, `ryx`, `ryz`,
 1. **Forgot to reverse the gate order in Pauli propagation.** Symptom: expectation values look like the inverse circuit. Re-read §1.
 2. **Used `depolarizing`/`depolarize` or `_adj` from intuition.** Symptom: `AttributeError` / `no method named …`. Correct names are `depolarize1` and `_dag`.
 3. **Tried to import `CoefficientThreshold` / `MaxPauliWeight` from Python.** Those are Rust-only. Use kwargs on `PauliSum.new`.
-4. **`.truncate()` on the wrong side.** In Python, calling `.truncate()` raises `AttributeError` — the binding already truncates after every gate. In Rust, *not* calling `state.truncate()` means your configured policy never runs and the sum grows unboundedly. See §3 above.
-5. **`GeneralizedTableau::new(n)` in Rust.** It takes two args: `(n_qubits, coefficient_threshold)`.
-6. **`IndexType = usize` for >64 qubits.** Silently overflows. Use `u128` or a `bnum` type.
-7. **`pip install` in docs.** Project policy is `uv` everywhere — `uv add`, `uv run`, `uv sync`. Fix any pip references you find.
+4. **`.truncate()` on the wrong side.** In Python, truncation runs after each gate call by default; use `truncate=False` plus one later `ps.truncate()` only when you intentionally want to defer pruning. In Rust, *not* calling `state.truncate()` means your configured policy never runs and the sum grows unboundedly. See §3 above.
+5. **Looped over Python `GeneralizedTableau` targets one call at a time.** Batch tableau layers with normal Python gates (`tab.h([0, 1, 2])`, `tab.cz([0, 4, 1, 5])`) or `tab.measure_many(...)`; in Rust tableau code, use the matching `*_many` methods. This advice is specific to `GeneralizedTableau`; it does not apply to the Python `PauliSum`.
+6. **`GeneralizedTableau::new(n)` in Rust.** It takes two args: `(n_qubits, coefficient_threshold)`.
+7. **`IndexType = usize` for >64 qubits.** Silently overflows. Use `u128` or a `bnum` type.
+8. **`pip install` in docs.** Project policy is `uv` everywhere — `uv add`, `uv run`, `uv sync`. Fix any pip references you find.
 
 ## Verifying you got the API right
 
diff --git a/skills/ppvm-usage/examples/python/noise_truncation.py b/skills/ppvm-usage/examples/python/noise_truncation.py
index 9c584192..5a9a6235 100644
--- a/skills/ppvm-usage/examples/python/noise_truncation.py
+++ b/skills/ppvm-usage/examples/python/noise_truncation.py
@@ -4,7 +4,7 @@
 A small-support observable propagated backwards through a noisy
 Trotter-style circuit. ``min_abs_coeff`` and ``max_pauli_weight`` are
 passed at construction; in Python the binding calls ``truncate()`` after
-every gate, so we don't manage it manually.
+each gate method call by default, so we don't manage it manually.
 
 The point of this example is the *workflow* — combining gates, noise,
 and bounded truncation — not a specific numeric outcome. We assert that