diff --git a/changelog.d/forbid-randomness-in-formulas.added.md b/changelog.d/forbid-randomness-in-formulas.added.md new file mode 100644 index 00000000..e651efdc --- /dev/null +++ b/changelog.d/forbid-randomness-in-formulas.added.md @@ -0,0 +1 @@ +Forbid random number generation inside variable formulas. A rules-engine formula must be a pure, deterministic function of its inputs, so any call to `numpy.random` or the standard library `random` module while a formula runs now raises `NonDeterministicFormulaError`. Stochastic inputs must be precomputed deterministically when building the dataset. Removes the per-variable `np.random.seed` previously applied before each calculation, which existed only to make formula-level randomness reproducible. diff --git a/policyengine_core/simulations/randomness_guard.py b/policyengine_core/simulations/randomness_guard.py new file mode 100644 index 00000000..3514b184 --- /dev/null +++ b/policyengine_core/simulations/randomness_guard.py @@ -0,0 +1,132 @@ +"""Forbid non-deterministic randomness inside variable formulas. + +A rules engine must be a pure function of its inputs: identical inputs must +always produce identical outputs. Calling a random number generator inside a +formula breaks that contract — the same household can get different results on +different runs — and makes whole datasets non-reproducible. + +While a formula is executing, this guard replaces the callables exposed by +``numpy.random`` and the standard library ``random`` module with functions that +raise :class:`NonDeterministicFormulaError`. This includes the RNG constructors +``np.random.default_rng``/``Generator``/``RandomState``, so *building* a +generator inside a formula — even a seeded one — also raises. Seeding does not +make randomness acceptable in a formula: stochastic inputs belong in the dataset +(computed once, deterministically, and stored), not in the formula. 
The guard is +re-entrant, so nested formula evaluation is handled correctly, and it restores +the originals once the outermost guarded formula returns. + +Known limitations (pinned by tests in ``test_randomness_guard`` so they cannot +drift silently): + +* A ``Generator``/``RandomState`` instance built *before* the formula runs + (e.g. ``rng = np.random.default_rng(0)`` at module scope) is not intercepted + when its methods are called inside the formula. ``numpy``'s generator classes + are immutable C extension types, so their bound methods cannot be patched. +* A *bare drawing function* bound into another module's namespace before the + guard installs (e.g. ``from numpy.random import random``) is not intercepted, + because the guard patches the ``numpy.random`` module attribute, not every + rebinding of it. + +Both require deliberately hoisting randomness out of the ``np.random.NAME`` form; +use ``np.random.NAME(...)`` or construct the generator inside the formula (both +caught) rather than importing or pre-building drawing callables. +""" + +from __future__ import annotations + +import random as _stdlib_random +from typing import Callable + +import numpy as np + + +class NonDeterministicFormulaError(RuntimeError): + """Raised when a formula invokes a random number generator.""" + + +# Public callables exposed by each randomness namespace, captured once at +# import so the per-formula swap is a cheap dict iteration rather than a +# fresh ``dir()`` scan. 
+def _public_callables(module) -> dict[str, Callable]: + return { + name: getattr(module, name) + for name in dir(module) + if not name.startswith("_") and callable(getattr(module, name)) + } + + +_GUARDED_NAMESPACES = ( + ("numpy.random", np.random, _public_callables(np.random)), + ("random", _stdlib_random, _public_callables(_stdlib_random)), +) + +# Re-entrancy bookkeeping: only the outermost guarded formula installs and +# removes the patches; the active variable name is tracked as a stack so the +# error message always names the formula that actually made the call. +_depth = 0 +_variable_stack: list[str] = [] + + +def _make_raiser(namespace: str, attribute: str) -> Callable: + qualified = f"{namespace}.{attribute}" + + def _raise(*args, **kwargs): + variable = _variable_stack[-1] if _variable_stack else "" + raise NonDeterministicFormulaError( + f"The formula for '{variable}' called {qualified}(), but rules-engine " + f"formulas must be deterministic functions of their inputs. Remove the " + f"random call. If you need a stochastic input, compute it once when " + f"building the dataset (with a seeded generator) and store it as an " + f"input variable instead." + ) + + return _raise + + +# Pre-build the raisers once per (namespace, attribute). +_RAISERS = { + id(module): {name: _make_raiser(namespace, name) for name in originals} + for namespace, module, originals in _GUARDED_NAMESPACES +} + + +def _install() -> None: + for _namespace, module, originals in _GUARDED_NAMESPACES: + raisers = _RAISERS[id(module)] + for name in originals: + setattr(module, name, raisers[name]) + + +def _restore() -> None: + for _namespace, module, originals in _GUARDED_NAMESPACES: + for name, original in originals.items(): + setattr(module, name, original) + + +class forbid_randomness: + """Context manager that bans RNG use while a formula runs. + + Re-entrant: nested formulas reuse the single installed patch set and only + the outermost context restores the originals. 
+ """ + + __slots__ = ("variable_name",) + + def __init__(self, variable_name: str): + self.variable_name = variable_name + + def __enter__(self) -> "forbid_randomness": + global _depth + if _depth == 0: + _install() + _depth += 1 + _variable_stack.append(self.variable_name) + return self + + def __exit__(self, exc_type, exc_value, traceback) -> bool: + global _depth + _variable_stack.pop() + _depth -= 1 + if _depth == 0: + _restore() + return False diff --git a/policyengine_core/simulations/simulation.py b/policyengine_core/simulations/simulation.py index c655106c..3864a5dc 100644 --- a/policyengine_core/simulations/simulation.py +++ b/policyengine_core/simulations/simulation.py @@ -14,6 +14,7 @@ from policyengine_core.entities.entity import Entity from policyengine_core.enums import Enum, EnumArray from policyengine_core.errors import CycleError, SpiralError +from policyengine_core.simulations.randomness_guard import forbid_randomness from policyengine_core.holders.holder import Holder from policyengine_core.periods import Period from policyengine_core.periods.config import ETERNITY, MONTH, YEAR @@ -591,7 +592,9 @@ def calculate( self.tracer.record_calculation_start(variable_name, period, self.branch_name) - np.random.seed(_stable_hash_to_seed(variable_name + str(period))) + # No per-variable RNG seeding: formulas may not use randomness at all + # (enforced by forbid_randomness in _run_formula), so there is nothing + # to make reproducible here. try: result = self._calculate(variable_name, period) @@ -1102,10 +1105,14 @@ def _run_formula( self.tax_benefit_system.parameters.tracer = self.tracer parameters_at = self.tax_benefit_system.parameters - if formula.__code__.co_argcount == 2: - array = formula(population, period) - else: - array = formula(population, period, parameters_at) + # A rules-engine formula must be a pure, deterministic function of its + # inputs. 
Forbid any random number generation while it runs so the same + # inputs always produce the same outputs. + with forbid_randomness(variable.name): + if formula.__code__.co_argcount == 2: + array = formula(population, period) + else: + array = formula(population, period, parameters_at) return array diff --git a/tests/core/test_randomness_guard.py b/tests/core/test_randomness_guard.py new file mode 100644 index 00000000..f96443a2 --- /dev/null +++ b/tests/core/test_randomness_guard.py @@ -0,0 +1,189 @@ +"""A variable formula must not invoke a random number generator. + +Rules-engine formulas have to be deterministic functions of their inputs, so +calling ``numpy.random`` or the stdlib ``random`` module inside a formula raises +:class:`NonDeterministicFormulaError`. These tests pin that behaviour and verify +the guard restores the randomness namespaces afterwards. +""" + +import random +from numpy.random import random as _bare_random_import + +import numpy as np +import pytest + +# A generator hoisted to module scope, built before any formula runs. Its bound +# methods cannot be patched (numpy generator classes are immutable C types), so +# using it inside a formula is a known, pinned gap in the guard. 
+_PREBUILT_RNG = np.random.default_rng(0) + +from policyengine_core import periods +from policyengine_core.country_template import CountryTaxBenefitSystem, entities +from policyengine_core.simulations import SimulationBuilder +from policyengine_core.simulations.randomness_guard import ( + NonDeterministicFormulaError, + forbid_randomness, +) +from policyengine_core.variables import Variable + +PERIOD = "2013-01" + + +def _simulation_with(*variable_classes): + system = CountryTaxBenefitSystem() + system.add_variables(*variable_classes) + return SimulationBuilder().build_default_simulation(system) + + +class uses_numpy_random(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that draws from numpy.random" + + def formula(person, period): + return np.random.random(person.count) + + +class uses_stdlib_random(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that draws from the random module" + + def formula(person, period): + return random.random() + + +class uses_seeded_generator(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that builds a seeded generator" + + def formula(person, period): + # Seeding does not make randomness acceptable inside a formula. 
+ return np.random.default_rng(0).random(person.count) + + +class deterministic(Variable): + value_type = int + entity = entities.Person + definition_period = periods.MONTH + label = "deterministic formula" + + def formula(person, period): + return person.count + + +class raises_value_error(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that raises a non-RNG exception" + + def formula(person, period): + raise ValueError("boom") + + +class uses_prebuilt_generator(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that uses a module-scope generator (known gap)" + + def formula(person, period): + return _PREBUILT_RNG.random(person.count) + + +class uses_bare_imported_function(Variable): + value_type = float + entity = entities.Person + definition_period = periods.MONTH + label = "formula that uses a by-name imported drawing function (known gap)" + + def formula(person, period): + return _bare_random_import() + + +def test_numpy_random_in_formula_raises(): + simulation = _simulation_with(uses_numpy_random) + with pytest.raises(NonDeterministicFormulaError, match="uses_numpy_random"): + simulation.calculate("uses_numpy_random", PERIOD) + + +def test_stdlib_random_in_formula_raises(): + simulation = _simulation_with(uses_stdlib_random) + with pytest.raises(NonDeterministicFormulaError, match="random"): + simulation.calculate("uses_stdlib_random", PERIOD) + + +def test_seeded_generator_in_formula_still_raises(): + simulation = _simulation_with(uses_seeded_generator) + with pytest.raises(NonDeterministicFormulaError): + simulation.calculate("uses_seeded_generator", PERIOD) + + +def test_deterministic_formula_is_unaffected(): + simulation = _simulation_with(deterministic) + result = simulation.calculate("deterministic", PERIOD) + assert (result == 1).all() + + +def test_randomness_restored_after_guarded_formula(): + simulation = 
_simulation_with(uses_numpy_random) + with pytest.raises(NonDeterministicFormulaError): + simulation.calculate("uses_numpy_random", PERIOD) + # Outside any formula, numpy and stdlib randomness work normally again. + assert isinstance(float(np.random.random()), float) + assert isinstance(random.random(), float) + + +def test_randomness_restored_after_non_rng_exception_in_formula(): + # If a formula raises a normal exception, _run_formula's guarded block must + # still restore the randomness namespaces (no leak of the patched state). + simulation = _simulation_with(raises_value_error) + with pytest.raises(ValueError, match="boom"): + simulation.calculate("raises_value_error", PERIOD) + assert isinstance(float(np.random.random()), float) + assert isinstance(random.random(), float) + + +def test_constructing_generator_inside_formula_is_caught(): + # Building a generator inside the formula hits the patched np.random + # constructor, so even this seeded form raises (covered by + # uses_seeded_generator too; kept explicit for the boundary). + simulation = _simulation_with(uses_seeded_generator) + with pytest.raises(NonDeterministicFormulaError): + simulation.calculate("uses_seeded_generator", PERIOD) + + +def test_prebuilt_generator_is_a_known_gap(): + # Documented limitation: a generator built before the formula runs cannot be + # intercepted (numpy generator classes are immutable). Pin the behaviour so a + # future change to it is noticed. + simulation = _simulation_with(uses_prebuilt_generator) + result = simulation.calculate("uses_prebuilt_generator", PERIOD) + assert result is not None + + +def test_by_name_imported_function_is_a_known_gap(): + # Documented limitation: a drawing function imported by name before the guard + # installs is not intercepted. Pin the behaviour. 
+ simulation = _simulation_with(uses_bare_imported_function) + result = simulation.calculate("uses_bare_imported_function", PERIOD) + assert result is not None + + +def test_guard_is_reentrant(): + # Entering twice and leaving the inner context must not restore the + # originals while the outer context is still active. + with forbid_randomness("outer"): + with forbid_randomness("inner"): + with pytest.raises(NonDeterministicFormulaError, match="inner"): + np.random.random() + # Still guarded: the outer context owns the patch. + with pytest.raises(NonDeterministicFormulaError, match="outer"): + np.random.random() + # Fully restored once the outermost context exits. + assert isinstance(float(np.random.random()), float)