From 4c78280c0b590c98d666444bf866418ebcd676e4 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 4 Jun 2026 11:27:09 +0100 Subject: [PATCH 1/2] Forbid randomness inside variable formulas A rules-engine formula must be a pure, deterministic function of its inputs: identical inputs must always produce identical outputs. Calling a random number generator inside a formula breaks that contract and makes datasets non-reproducible (a property_purchased assignment built on unseeded np.random recently spiked a UK income decile's tax rate and blocked data releases for ~2 weeks). While a formula runs, forbid_randomness replaces the public callables of numpy.random and the stdlib random module with functions that raise NonDeterministicFormulaError. Seeding does not make randomness acceptable in a formula; stochastic inputs must be precomputed deterministically when building the dataset and stored as inputs. The guard is re-entrant and restores the namespaces once the outermost formula returns. Removes the per-variable np.random.seed previously applied at the top of calculate(), which existed only to make formula-level randomness reproducible and now conflicts with the guard. Core-internal seeding in __init__ (for non-formula randomness) is unchanged. No formula in policyengine-uk or policyengine-us uses randomness, so this is enforcement of an already-held invariant. Full core suite passes (580 tests). 
--- .../forbid-randomness-in-formulas.added.md | 1 + .../simulations/randomness_guard.py | 114 +++++++++++++++++ policyengine_core/simulations/simulation.py | 17 ++- tests/core/test_randomness_guard.py | 117 ++++++++++++++++++ 4 files changed, 244 insertions(+), 5 deletions(-) create mode 100644 changelog.d/forbid-randomness-in-formulas.added.md create mode 100644 policyengine_core/simulations/randomness_guard.py create mode 100644 tests/core/test_randomness_guard.py diff --git a/changelog.d/forbid-randomness-in-formulas.added.md b/changelog.d/forbid-randomness-in-formulas.added.md new file mode 100644 index 00000000..e651efdc --- /dev/null +++ b/changelog.d/forbid-randomness-in-formulas.added.md @@ -0,0 +1 @@ +Forbid random number generation inside variable formulas. A rules-engine formula must be a pure, deterministic function of its inputs, so any call to `numpy.random` or the standard library `random` module while a formula runs now raises `NonDeterministicFormulaError`. Stochastic inputs must be precomputed deterministically when building the dataset. Removes the per-variable `np.random.seed` previously applied before each calculation, which existed only to make formula-level randomness reproducible. diff --git a/policyengine_core/simulations/randomness_guard.py b/policyengine_core/simulations/randomness_guard.py new file mode 100644 index 00000000..7e0c7bed --- /dev/null +++ b/policyengine_core/simulations/randomness_guard.py @@ -0,0 +1,114 @@ +"""Forbid non-deterministic randomness inside variable formulas. + +A rules engine must be a pure function of its inputs: identical inputs must +always produce identical outputs. Calling a random number generator inside a +formula breaks that contract — the same household can get different results on +different runs — and makes whole datasets non-reproducible. 
+ +While a formula is executing, this guard replaces the callables exposed by +``numpy.random`` and the standard library ``random`` module with functions that +raise :class:`NonDeterministicFormulaError`. Seeding does not make this +acceptable: stochastic inputs belong in the dataset (computed once, +deterministically, and stored), not in the formula. The guard is re-entrant, so +nested formula evaluation is handled correctly, and it restores the original +callables once the outermost guarded formula returns. +""" + +from __future__ import annotations + +import random as _stdlib_random +from typing import Callable + +import numpy as np + + +class NonDeterministicFormulaError(RuntimeError): + """Raised when a formula invokes a random number generator.""" + + +# Public callables exposed by each randomness namespace, captured once at +# import so the per-formula swap is a cheap dict iteration rather than a +# fresh ``dir()`` scan. +def _public_callables(module) -> dict[str, Callable]: + return { + name: getattr(module, name) + for name in dir(module) + if not name.startswith("_") and callable(getattr(module, name)) + } + + +_GUARDED_NAMESPACES = ( + ("numpy.random", np.random, _public_callables(np.random)), + ("random", _stdlib_random, _public_callables(_stdlib_random)), +) + +# Re-entrancy bookkeeping: only the outermost guarded formula installs and +# removes the patches; the active variable name is tracked as a stack so the +# error message always names the formula that actually made the call. +_depth = 0 +_variable_stack: list[str] = [] + + +def _make_raiser(namespace: str, attribute: str) -> Callable: + qualified = f"{namespace}.{attribute}" + + def _raise(*args, **kwargs): + variable = _variable_stack[-1] if _variable_stack else "" + raise NonDeterministicFormulaError( + f"The formula for '{variable}' called {qualified}(), but rules-engine " + f"formulas must be deterministic functions of their inputs. Remove the " + f"random call. 
If you need a stochastic input, compute it once when " + f"building the dataset (with a seeded generator) and store it as an " + f"input variable instead." + ) + + return _raise + + +# Pre-build the raisers once per (namespace, attribute). +_RAISERS = { + id(module): {name: _make_raiser(namespace, name) for name in originals} + for namespace, module, originals in _GUARDED_NAMESPACES +} + + +def _install() -> None: + for _namespace, module, originals in _GUARDED_NAMESPACES: + raisers = _RAISERS[id(module)] + for name in originals: + setattr(module, name, raisers[name]) + + +def _restore() -> None: + for _namespace, module, originals in _GUARDED_NAMESPACES: + for name, original in originals.items(): + setattr(module, name, original) + + +class forbid_randomness: + """Context manager that bans RNG use while a formula runs. + + Re-entrant: nested formulas reuse the single installed patch set and only + the outermost context restores the originals. + """ + + __slots__ = ("variable_name",) + + def __init__(self, variable_name: str): + self.variable_name = variable_name + + def __enter__(self) -> "forbid_randomness": + global _depth + if _depth == 0: + _install() + _depth += 1 + _variable_stack.append(self.variable_name) + return self + + def __exit__(self, exc_type, exc_value, traceback) -> bool: + global _depth + _variable_stack.pop() + _depth -= 1 + if _depth == 0: + _restore() + return False diff --git a/policyengine_core/simulations/simulation.py b/policyengine_core/simulations/simulation.py index c655106c..3864a5dc 100644 --- a/policyengine_core/simulations/simulation.py +++ b/policyengine_core/simulations/simulation.py @@ -14,6 +14,7 @@ from policyengine_core.entities.entity import Entity from policyengine_core.enums import Enum, EnumArray from policyengine_core.errors import CycleError, SpiralError +from policyengine_core.simulations.randomness_guard import forbid_randomness from policyengine_core.holders.holder import Holder from policyengine_core.periods import 
Period from policyengine_core.periods.config import ETERNITY, MONTH, YEAR @@ -591,7 +592,9 @@ def calculate( self.tracer.record_calculation_start(variable_name, period, self.branch_name) - np.random.seed(_stable_hash_to_seed(variable_name + str(period))) + # No per-variable RNG seeding: formulas may not use randomness at all + # (enforced by forbid_randomness in _run_formula), so there is nothing + # to make reproducible here. try: result = self._calculate(variable_name, period) @@ -1102,10 +1105,14 @@ def _run_formula( self.tax_benefit_system.parameters.tracer = self.tracer parameters_at = self.tax_benefit_system.parameters - if formula.__code__.co_argcount == 2: - array = formula(population, period) - else: - array = formula(population, period, parameters_at) + # A rules-engine formula must be a pure, deterministic function of its + # inputs. Forbid any random number generation while it runs so the same + # inputs always produce the same outputs. + with forbid_randomness(variable.name): + if formula.__code__.co_argcount == 2: + array = formula(population, period) + else: + array = formula(population, period, parameters_at) return array diff --git a/tests/core/test_randomness_guard.py b/tests/core/test_randomness_guard.py new file mode 100644 index 00000000..46f06de8 --- /dev/null +++ b/tests/core/test_randomness_guard.py @@ -0,0 +1,117 @@ +"""A variable formula must not invoke a random number generator. + +Rules-engine formulas have to be deterministic functions of their inputs, so +calling ``numpy.random`` or the stdlib ``random`` module inside a formula raises +:class:`NonDeterministicFormulaError`. These tests pin that behaviour and verify +the guard restores the randomness namespaces afterwards. 
+""" + +import random + +import numpy as np +import pytest + +from policyengine_core import periods +from policyengine_core.country_template import CountryTaxBenefitSystem, entities +from policyengine_core.simulations import SimulationBuilder +from policyengine_core.simulations.randomness_guard import ( + NonDeterministicFormulaError, + forbid_randomness, +) +from policyengine_core.variables import Variable + +PERIOD = "2013-01" + + +def _simulation_with(*variable_classes): + system = CountryTaxBenefitSystem() + system.add_variables(*variable_classes) + return SimulationBuilder().build_default_simulation(system) + + +class uses_numpy_random(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that draws from numpy.random" + + def formula(person, period): + return np.random.random(person.count) + + +class uses_stdlib_random(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that draws from the random module" + + def formula(person, period): + return random.random() + + +class uses_seeded_generator(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that builds a seeded generator" + + def formula(person, period): + # Seeding does not make randomness acceptable inside a formula. 
+ return np.random.default_rng(0).random(person.count) + + +class deterministic(Variable): + value_type = int + entity = entities.Person + definition_period = periods.MONTH + label = "deterministic formula" + + def formula(person, period): + return person.count + + +def test_numpy_random_in_formula_raises(): + simulation = _simulation_with(uses_numpy_random) + with pytest.raises(NonDeterministicFormulaError, match="uses_numpy_random"): + simulation.calculate("uses_numpy_random", PERIOD) + + +def test_stdlib_random_in_formula_raises(): + simulation = _simulation_with(uses_stdlib_random) + with pytest.raises(NonDeterministicFormulaError, match="random"): + simulation.calculate("uses_stdlib_random", PERIOD) + + +def test_seeded_generator_in_formula_still_raises(): + simulation = _simulation_with(uses_seeded_generator) + with pytest.raises(NonDeterministicFormulaError): + simulation.calculate("uses_seeded_generator", PERIOD) + + +def test_deterministic_formula_is_unaffected(): + simulation = _simulation_with(deterministic) + result = simulation.calculate("deterministic", PERIOD) + assert (result == 1).all() + + +def test_randomness_restored_after_guarded_formula(): + simulation = _simulation_with(uses_numpy_random) + with pytest.raises(NonDeterministicFormulaError): + simulation.calculate("uses_numpy_random", PERIOD) + # Outside any formula, numpy and stdlib randomness work normally again. + assert isinstance(float(np.random.random()), float) + assert isinstance(random.random(), float) + + +def test_guard_is_reentrant(): + # Entering twice and leaving the inner context must not restore the + # originals while the outer context is still active. + with forbid_randomness("outer"): + with forbid_randomness("inner"): + with pytest.raises(NonDeterministicFormulaError, match="inner"): + np.random.random() + # Still guarded: the outer context owns the patch. 
+ with pytest.raises(NonDeterministicFormulaError, match="outer"): + np.random.random() + # Fully restored once the outermost context exits. + assert isinstance(float(np.random.random()), float) From f69bc35ee03b29a328a08887476ec652aa1c4468 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 4 Jun 2026 11:42:31 +0100 Subject: [PATCH 2/2] Fix review findings: document RNG-guard limits, pin with tests Independent review of the determinism guard surfaced two interception gaps and missing exception-safety coverage: - A generator hoisted to module scope (rng = np.random.default_rng(0)) and used inside a formula is not caught: numpy's generator classes are immutable C extension types, so their bound methods cannot be patched. (Building a generator inside the formula IS caught, since the np.random.default_rng constructor is patched.) - A drawing function imported by name before the guard installs (from numpy.random import random) is not caught. Neither is closeable without heavy machinery, and no PolicyEngine formula uses randomness, so document both accurately in the module docstring and pin them with tests so the boundary cannot drift silently. Corrects an earlier draft docstring that wrongly claimed the generator classes were patched. Adds tests: - randomness namespaces are restored after a formula raises a non-RNG exception (pins _run_formula guarded-block exception safety), - constructing a generator inside a formula raises, - the two known gaps behave as documented. 
--- .../simulations/randomness_guard.py | 28 ++++++-- tests/core/test_randomness_guard.py | 72 +++++++++++++++++++ 2 files changed, 95 insertions(+), 5 deletions(-) diff --git a/policyengine_core/simulations/randomness_guard.py b/policyengine_core/simulations/randomness_guard.py index 7e0c7bed..3514b184 100644 --- a/policyengine_core/simulations/randomness_guard.py +++ b/policyengine_core/simulations/randomness_guard.py @@ -7,11 +7,29 @@ While a formula is executing, this guard replaces the callables exposed by ``numpy.random`` and the standard library ``random`` module with functions that -raise :class:`NonDeterministicFormulaError`. Seeding does not make this -acceptable: stochastic inputs belong in the dataset (computed once, -deterministically, and stored), not in the formula. The guard is re-entrant, so -nested formula evaluation is handled correctly, and it restores the original -callables once the outermost guarded formula returns. +raise :class:`NonDeterministicFormulaError`. This includes the RNG constructors +``np.random.default_rng``/``Generator``/``RandomState``, so *building* a +generator inside a formula — even a seeded one — also raises. Seeding does not +make randomness acceptable in a formula: stochastic inputs belong in the dataset +(computed once, deterministically, and stored), not in the formula. The guard is +re-entrant, so nested formula evaluation is handled correctly, and it restores +the originals once the outermost guarded formula returns. + +Known limitations (pinned by tests in ``test_randomness_guard`` so they cannot +drift silently): + +* A ``Generator``/``RandomState`` instance built *before* the formula runs + (e.g. ``rng = np.random.default_rng(0)`` at module scope) is not intercepted + when its methods are called inside the formula. ``numpy``'s generator classes + are immutable C extension types, so their bound methods cannot be patched. 
+* A *bare drawing function* bound into another module's namespace before the + guard installs (e.g. ``from numpy.random import random``) is not intercepted, + because the guard patches the ``numpy.random`` module attribute, not every + rebinding of it. + +Both require deliberately hoisting randomness out of the ``np.random.FUNC`` form; +use ``np.random.FUNC(...)`` or construct the generator inside the formula (both +caught) rather than importing or pre-building drawing callables. """ from __future__ import annotations diff --git a/tests/core/test_randomness_guard.py b/tests/core/test_randomness_guard.py index 46f06de8..f96443a2 100644 --- a/tests/core/test_randomness_guard.py +++ b/tests/core/test_randomness_guard.py @@ -7,10 +7,16 @@ """ import random +from numpy.random import random as _bare_random_import import numpy as np import pytest +# A generator hoisted to module scope, built before any formula runs. Its bound +# methods cannot be patched (numpy generator classes are immutable C types), so +# using it inside a formula is a known, pinned gap in the guard. 
+_PREBUILT_RNG = np.random.default_rng(0) + from policyengine_core import periods from policyengine_core.country_template import CountryTaxBenefitSystem, entities from policyengine_core.simulations import SimulationBuilder @@ -70,6 +76,36 @@ def formula(person, period): return person.count +class raises_value_error(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that raises a non-RNG exception" + + def formula(person, period): + raise ValueError("boom") + + +class uses_prebuilt_generator(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that uses a module-scope generator (known gap)" + + def formula(person, period): + return _PREBUILT_RNG.random(person.count) + + +class uses_bare_imported_function(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that uses a by-name imported drawing function (known gap)" + + def formula(person, period): + return _bare_random_import() + + def test_numpy_random_in_formula_raises(): simulation = _simulation_with(uses_numpy_random) with pytest.raises(NonDeterministicFormulaError, match="uses_numpy_random"): @@ -103,6 +139,42 @@ def test_randomness_restored_after_guarded_formula(): assert isinstance(random.random(), float) +def test_randomness_restored_after_non_rng_exception_in_formula(): + # If a formula raises a normal exception, _run_formula's guarded block must + # still restore the randomness namespaces (no leak of the patched state). 
+ simulation = _simulation_with(raises_value_error) + with pytest.raises(ValueError, match="boom"): + simulation.calculate("raises_value_error", PERIOD) + assert isinstance(float(np.random.random()), float) + assert isinstance(random.random(), float) + + +def test_constructing_generator_inside_formula_is_caught(): + # Building a generator inside the formula hits the patched np.random + # constructor, so even this seeded form raises (covered by + # uses_seeded_generator too; kept explicit for the boundary). + simulation = _simulation_with(uses_seeded_generator) + with pytest.raises(NonDeterministicFormulaError): + simulation.calculate("uses_seeded_generator", PERIOD) + + +def test_prebuilt_generator_is_a_known_gap(): + # Documented limitation: a generator built before the formula runs cannot be + # intercepted (numpy generator classes are immutable). Pin the behaviour so a + # future change to it is noticed. + simulation = _simulation_with(uses_prebuilt_generator) + result = simulation.calculate("uses_prebuilt_generator", PERIOD) + assert result is not None + + +def test_by_name_imported_function_is_a_known_gap(): + # Documented limitation: a drawing function imported by name before the guard + # installs is not intercepted. Pin the behaviour. + simulation = _simulation_with(uses_bare_imported_function) + result = simulation.calculate("uses_bare_imported_function", PERIOD) + assert result is not None + + def test_guard_is_reentrant(): # Entering twice and leaving the inner context must not restore the # originals while the outer context is still active.