Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/forbid-randomness-in-formulas.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Forbid random number generation inside variable formulas. A rules-engine formula must be a pure, deterministic function of its inputs, so any call to `numpy.random` or the standard library `random` module while a formula runs now raises `NonDeterministicFormulaError`. Stochastic inputs must be precomputed deterministically when building the dataset. Removes the per-variable `np.random.seed` previously applied before each calculation, which existed only to make formula-level randomness reproducible.
132 changes: 132 additions & 0 deletions policyengine_core/simulations/randomness_guard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""Forbid non-deterministic randomness inside variable formulas.

A rules engine must be a pure function of its inputs: identical inputs must
always produce identical outputs. Calling a random number generator inside a
formula breaks that contract — the same household can get different results on
different runs — and makes whole datasets non-reproducible.

While a formula is executing, this guard replaces the callables exposed by
``numpy.random`` and the standard library ``random`` module with functions that
raise :class:`NonDeterministicFormulaError`. This includes the RNG constructors
``np.random.default_rng``/``Generator``/``RandomState``, so *building* a
generator inside a formula — even a seeded one — also raises. Seeding does not
make randomness acceptable in a formula: stochastic inputs belong in the dataset
(computed once, deterministically, and stored), not in the formula. The guard is
re-entrant, so nested formula evaluation is handled correctly, and it restores
the originals once the outermost guarded formula returns.

Known limitations (pinned by tests in ``test_randomness_guard`` so they cannot
drift silently):

* A ``Generator``/``RandomState`` instance built *before* the formula runs
(e.g. ``rng = np.random.default_rng(0)`` at module scope) is not intercepted
when its methods are called inside the formula. ``numpy``'s generator classes
are immutable C extension types, so their bound methods cannot be patched.
* A *bare drawing function* bound into another module's namespace before the
guard installs (e.g. ``from numpy.random import random``) is not intercepted,
because the guard patches the ``numpy.random`` module attribute, not every
rebinding of it.

Both gaps arise only when randomness is deliberately hoisted out of the
``np.random.<fn>(...)`` call form. Calls written as ``np.random.<fn>(...)`` and
generators constructed inside the formula are both caught, so keep to those
forms rather than importing or pre-building drawing callables.
"""

from __future__ import annotations

import random as _stdlib_random
from typing import Callable

import numpy as np


class NonDeterministicFormulaError(RuntimeError):
    """Signal that a formula tried to use a random number generator.

    Formulas must be deterministic functions of their inputs, so any such call
    is reported with this error instead of being executed.
    """


# Public callables exposed by each randomness namespace, captured once at
# import so the per-formula swap is a cheap dict iteration rather than a
# fresh ``dir()`` scan.
def _public_callables(module) -> dict[str, Callable]:
    """Map each public callable attribute of ``module`` to its current value.

    Names that start with an underscore are ignored, as are attributes that
    are not callable.
    """
    found: dict[str, Callable] = {}
    for attr_name in dir(module):
        if attr_name.startswith("_"):
            continue
        value = getattr(module, attr_name)
        if callable(value):
            found[attr_name] = value
    return found


# Each entry is ``(qualified name, module object, {attribute: original})``; the
# originals are snapshotted when this module is imported.
_GUARDED_NAMESPACES = tuple(
    (label, module, _public_callables(module))
    for label, module in (
        ("numpy.random", np.random),
        ("random", _stdlib_random),
    )
)

# Names of the variables whose formulas are currently guarded, innermost last.
# The raising stand-ins read the top entry, so the error message names the
# formula that actually made the call.
_variable_stack: list[str] = []
# Number of guard contexts currently active. Only the outermost context
# (depth 0 on entry, 0 again on exit) installs and removes the patches.
_depth = 0


def _make_raiser(namespace: str, attribute: str) -> Callable:
    """Build the stand-in that replaces ``namespace.attribute`` under the guard.

    The returned callable accepts any arguments and always raises
    :class:`NonDeterministicFormulaError`. The message names the innermost
    guarded variable (``<unknown>`` when none is active) and the qualified
    call that was attempted.
    """
    qualified = f"{namespace}.{attribute}"

    def _raise(*args, **kwargs):
        # Looked up at call time so nested formulas report the right name.
        variable = "<unknown>" if not _variable_stack else _variable_stack[-1]
        message = (
            f"The formula for '{variable}' called {qualified}(), but rules-engine "
            f"formulas must be deterministic functions of their inputs. Remove the "
            f"random call. If you need a stochastic input, compute it once when "
            f"building the dataset (with a seeded generator) and store it as an "
            f"input variable instead."
        )
        raise NonDeterministicFormulaError(message)

    return _raise


# Raising stand-ins, built once at import. Keyed first by ``id()`` of the
# owning module, then by attribute name.
_RAISERS = dict(
    (id(module), {name: _make_raiser(namespace, name) for name in originals})
    for namespace, module, originals in _GUARDED_NAMESPACES
)


def _install() -> None:
    """Replace every guarded callable with its pre-built raising stand-in."""
    for _, module, originals in _GUARDED_NAMESPACES:
        replacements = _RAISERS[id(module)]
        for attr_name in originals:
            stand_in = replacements[attr_name]
            setattr(module, attr_name, stand_in)


def _restore() -> None:
    """Put the callables captured in ``_GUARDED_NAMESPACES`` back on each module."""
    for entry in _GUARDED_NAMESPACES:
        module, originals = entry[1], entry[2]
        for attr_name in originals:
            setattr(module, attr_name, originals[attr_name])


class forbid_randomness:
    """Context manager under which guarded random-number calls raise.

    Contexts may be nested. The first one entered installs the raising
    stand-ins, inner ones only record their variable name, and the original
    callables are put back when the last active context exits.
    """

    __slots__ = ("variable_name",)

    def __init__(self, variable_name: str):
        # Name reported in the error message while this context is innermost.
        self.variable_name = variable_name

    def __enter__(self) -> "forbid_randomness":
        global _depth
        is_outermost = _depth == 0
        if is_outermost:
            _install()
        _depth += 1
        _variable_stack.append(self.variable_name)
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> bool:
        global _depth
        _variable_stack.pop()
        _depth -= 1
        if not _depth:
            # Last active context: hand the namespaces back unpatched.
            _restore()
        # Never suppress an exception raised inside the guarded block.
        return False
17 changes: 12 additions & 5 deletions policyengine_core/simulations/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from policyengine_core.entities.entity import Entity
from policyengine_core.enums import Enum, EnumArray
from policyengine_core.errors import CycleError, SpiralError
from policyengine_core.simulations.randomness_guard import forbid_randomness
from policyengine_core.holders.holder import Holder
from policyengine_core.periods import Period
from policyengine_core.periods.config import ETERNITY, MONTH, YEAR
Expand Down Expand Up @@ -591,7 +592,9 @@ def calculate(

self.tracer.record_calculation_start(variable_name, period, self.branch_name)

np.random.seed(_stable_hash_to_seed(variable_name + str(period)))
# No per-variable RNG seeding: formulas may not use randomness at all
# (enforced by forbid_randomness in _run_formula), so there is nothing
# to make reproducible here.

try:
result = self._calculate(variable_name, period)
Expand Down Expand Up @@ -1102,10 +1105,14 @@ def _run_formula(
self.tax_benefit_system.parameters.tracer = self.tracer
parameters_at = self.tax_benefit_system.parameters

if formula.__code__.co_argcount == 2:
array = formula(population, period)
else:
array = formula(population, period, parameters_at)
# A rules-engine formula must be a pure, deterministic function of its
# inputs. Forbid any random number generation while it runs so the same
# inputs always produce the same outputs.
with forbid_randomness(variable.name):
if formula.__code__.co_argcount == 2:
array = formula(population, period)
else:
array = formula(population, period, parameters_at)

return array

Expand Down
189 changes: 189 additions & 0 deletions tests/core/test_randomness_guard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
"""A variable formula must not invoke a random number generator.

Rules-engine formulas have to be deterministic functions of their inputs, so
calling ``numpy.random`` or the stdlib ``random`` module inside a formula raises
:class:`NonDeterministicFormulaError`. These tests pin that behaviour and verify
the guard restores the randomness namespaces afterwards.
"""

import random
from numpy.random import random as _bare_random_import

import numpy as np
import pytest

# Built at import time, i.e. before any formula is evaluated. numpy's generator
# classes are immutable C types, so the guard cannot patch the bound methods of
# this instance; calling it from a formula is a known, pinned gap.
_PREBUILT_RNG = np.random.default_rng(seed=0)

from policyengine_core import periods
from policyengine_core.country_template import CountryTaxBenefitSystem, entities
from policyengine_core.simulations import SimulationBuilder
from policyengine_core.simulations.randomness_guard import (
NonDeterministicFormulaError,
forbid_randomness,
)
from policyengine_core.variables import Variable

# Month passed to every ``calculate`` call below; the test variables all use a
# monthly definition period.
PERIOD = "2013-01"


def _simulation_with(*variable_classes):
    """Return a default simulation of the country template plus extra variables."""
    tax_benefit_system = CountryTaxBenefitSystem()
    tax_benefit_system.add_variables(*variable_classes)
    builder = SimulationBuilder()
    return builder.build_default_simulation(tax_benefit_system)


class uses_numpy_random(Variable):
    # Test variable: its formula draws through the ``np.random`` module
    # attribute, which the tests below expect the guard to reject.
    value_type = float
    entity = entities.Person
    definition_period = periods.MONTH
    label = "formula that draws from numpy.random"

    def formula(person, period):
        # ``np.random.random`` is looked up on the module at call time.
        return np.random.random(person.count)


class uses_stdlib_random(Variable):
    # Test variable: its formula draws from the standard library ``random``
    # module, which the tests below expect the guard to reject.
    value_type = float
    entity = entities.Person
    definition_period = periods.MONTH
    label = "formula that draws from the random module"

    def formula(person, period):
        # ``random.random`` is looked up on the module at call time.
        return random.random()


class uses_seeded_generator(Variable):
    # Test variable: its formula constructs a seeded generator via
    # ``np.random.default_rng`` and draws from it.
    value_type = float
    entity = entities.Person
    definition_period = periods.MONTH
    label = "formula that builds a seeded generator"

    def formula(person, period):
        # Seeding does not make randomness acceptable inside a formula.
        return np.random.default_rng(0).random(person.count)


class deterministic(Variable):
    # Test variable: its formula uses no randomness, so it serves as the
    # control case for the guard.
    value_type = int
    entity = entities.Person
    definition_period = periods.MONTH
    label = "deterministic formula"

    def formula(person, period):
        # Depends only on the population size.
        return person.count


class raises_value_error(Variable):
    # Test variable: its formula fails with an ordinary exception, used to
    # check what happens to the guard when a formula errors out.
    value_type = float
    entity = entities.Person
    definition_period = periods.MONTH
    label = "formula that raises a non-RNG exception"

    def formula(person, period):
        # Not a randomness error: the message is matched by the test.
        raise ValueError("boom")


class uses_prebuilt_generator(Variable):
    # Test variable: its formula draws from the module-scope ``_PREBUILT_RNG``
    # instance instead of going through ``np.random`` (documented known gap).
    value_type = float
    entity = entities.Person
    definition_period = periods.MONTH
    label = "formula that uses a module-scope generator (known gap)"

    def formula(person, period):
        # Calls a method on an existing generator instance.
        return _PREBUILT_RNG.random(person.count)


class uses_bare_imported_function(Variable):
    # Test variable: its formula calls ``_bare_random_import``, a name bound by
    # ``from numpy.random import random`` at import time (documented known gap).
    value_type = float
    entity = entities.Person
    definition_period = periods.MONTH
    label = "formula that uses a by-name imported drawing function (known gap)"

    def formula(person, period):
        # Uses this module's own binding, not the ``np.random`` attribute.
        return _bare_random_import()


def test_numpy_random_in_formula_raises():
    """A ``np.random`` draw in a formula raises, and the error names the variable."""
    sim = _simulation_with(uses_numpy_random)
    expected_error = pytest.raises(
        NonDeterministicFormulaError, match="uses_numpy_random"
    )
    with expected_error:
        sim.calculate("uses_numpy_random", PERIOD)


def test_stdlib_random_in_formula_raises():
    """A stdlib ``random`` draw in a formula raises, naming variable and call."""
    simulation = _simulation_with(uses_stdlib_random)
    # Every guard message contains the word "random" ("Remove the random
    # call"), so matching on that alone would pass for any raiser. Pin both the
    # variable name and the stdlib-qualified call; "called random.random()"
    # cannot match the numpy form "called numpy.random.random()".
    with pytest.raises(
        NonDeterministicFormulaError,
        match=r"'uses_stdlib_random' called random\.random\(\)",
    ):
        simulation.calculate("uses_stdlib_random", PERIOD)


def test_seeded_generator_in_formula_still_raises():
    """Seeding a generator inside a formula does not exempt it from the guard."""
    sim = _simulation_with(uses_seeded_generator)
    expected_error = pytest.raises(NonDeterministicFormulaError)
    with expected_error:
        sim.calculate("uses_seeded_generator", PERIOD)


def test_deterministic_formula_is_unaffected():
    """A formula that uses no randomness is calculated as usual."""
    sim = _simulation_with(deterministic)
    values = sim.calculate("deterministic", PERIOD)
    every_value_is_one = (values == 1).all()
    assert every_value_is_one


def test_randomness_restored_after_guarded_formula():
    """After a formula trips the guard, the original callables are back in place."""
    # Captured outside any formula, so these are the unpatched callables.
    numpy_draw = np.random.random
    stdlib_draw = random.random
    simulation = _simulation_with(uses_numpy_random)
    with pytest.raises(NonDeterministicFormulaError):
        simulation.calculate("uses_numpy_random", PERIOD)
    # Identity pins that the very same objects were put back;
    # ``isinstance(float(x), float)`` alone is true for any x.
    assert np.random.random is numpy_draw
    assert random.random is stdlib_draw
    # Outside any formula, numpy and stdlib randomness work normally again.
    assert 0.0 <= float(np.random.random()) < 1.0
    assert 0.0 <= random.random() < 1.0


def test_randomness_restored_after_non_rng_exception_in_formula():
    """An ordinary exception in a formula must not leak the patched state."""
    # Captured outside any formula, so these are the unpatched callables.
    numpy_draw = np.random.random
    stdlib_draw = random.random
    # If a formula raises a normal exception, _run_formula's guarded block must
    # still restore the randomness namespaces (no leak of the patched state).
    simulation = _simulation_with(raises_value_error)
    with pytest.raises(ValueError, match="boom"):
        simulation.calculate("raises_value_error", PERIOD)
    # Identity pins that the very same objects were put back;
    # ``isinstance(float(x), float)`` alone is true for any x.
    assert np.random.random is numpy_draw
    assert random.random is stdlib_draw
    assert 0.0 <= float(np.random.random()) < 1.0
    assert 0.0 <= random.random() < 1.0


def test_constructing_generator_inside_formula_is_caught():
    """Building a generator inside a formula trips the patched constructor."""
    # Building a generator inside the formula hits the patched np.random
    # constructor, so even this seeded form raises. Matching on the qualified
    # constructor name pins that the constructor itself was intercepted, which
    # is what separates this boundary test from the plain "seeded generator
    # still raises" test over the same variable.
    simulation = _simulation_with(uses_seeded_generator)
    with pytest.raises(
        NonDeterministicFormulaError,
        match=r"called numpy\.random\.default_rng\(\)",
    ):
        simulation.calculate("uses_seeded_generator", PERIOD)


def test_prebuilt_generator_is_a_known_gap():
    """Pin the known gap: a generator built before the formula is not blocked."""
    # numpy generator classes are immutable, so methods of an instance created
    # ahead of time cannot be intercepted. The calculation therefore succeeds;
    # a change in that behaviour should make this test fail and get noticed.
    sim = _simulation_with(uses_prebuilt_generator)
    outcome = sim.calculate("uses_prebuilt_generator", PERIOD)
    assert outcome is not None


def test_by_name_imported_function_is_a_known_gap():
    """Pin the known gap: a drawing function imported by name is not blocked."""
    # The name was bound in this module before the guard installed its patches,
    # so the call does not go through the patched module attribute.
    sim = _simulation_with(uses_bare_imported_function)
    outcome = sim.calculate("uses_bare_imported_function", PERIOD)
    assert outcome is not None


def test_guard_is_reentrant():
    """Nested guards share one patch set and report the innermost variable."""
    # Entering twice and leaving the inner context must not restore the
    # originals while the outer context is still active.
    with forbid_randomness("outer"):
        with forbid_randomness("inner"):
            # Innermost context is named in the error.
            with pytest.raises(NonDeterministicFormulaError, match="inner"):
                np.random.random()
        # Still guarded: the outer context owns the patch.
        with pytest.raises(NonDeterministicFormulaError, match="outer"):
            np.random.random()
    # Fully restored once the outermost context exits.
    assert isinstance(float(np.random.random()), float)
Loading